Browse Source

feat(dnn/cuda): add cutlass matmul using split k parallel

GitOrigin-RevId: 650209e35f
tags/v1.3.0
Megvii Engine Team 4 years ago
parent
commit
973d2a0ac2
100 changed files with 1965 additions and 97 deletions
  1. +2
    -2
      dnn/scripts/Makefile
  2. +20
    -0
      dnn/src/cuda/matrix_mul/algos.cpp
  3. +27
    -0
      dnn/src/cuda/matrix_mul/algos.h
  4. +76
    -0
      dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp
  5. +145
    -45
      dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cu
  6. +3
    -3
      dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh
  7. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu
  8. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn_splitk_parallel.cu
  9. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt.cu
  10. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt_splitk_parallel.cu
  11. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu
  12. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu
  13. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu
  14. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt_splitk_parallel.cu
  15. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn.cu
  16. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn_splitk_parallel.cu
  17. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu
  18. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt_splitk_parallel.cu
  19. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu
  20. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn_splitk_parallel.cu
  21. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu
  22. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt_splitk_parallel.cu
  23. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu
  24. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn_splitk_parallel.cu
  25. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt.cu
  26. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt_splitk_parallel.cu
  27. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu
  28. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn_splitk_parallel.cu
  29. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu
  30. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt_splitk_parallel.cu
  31. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu
  32. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn_splitk_parallel.cu
  33. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt.cu
  34. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt_splitk_parallel.cu
  35. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu
  36. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn_splitk_parallel.cu
  37. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt.cu
  38. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt_splitk_parallel.cu
  39. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu
  40. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn_splitk_parallel.cu
  41. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt.cu
  42. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt_splitk_parallel.cu
  43. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu
  44. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn_splitk_parallel.cu
  45. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt.cu
  46. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt_splitk_parallel.cu
  47. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn.cu
  48. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn_splitk_parallel.cu
  49. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt.cu
  50. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt_splitk_parallel.cu
  51. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn.cu
  52. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn_splitk_parallel.cu
  53. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu
  54. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt_splitk_parallel.cu
  55. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn.cu
  56. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn_splitk_parallel.cu
  57. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt.cu
  58. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt_splitk_parallel.cu
  59. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu
  60. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn_splitk_parallel.cu
  61. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt.cu
  62. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt_splitk_parallel.cu
  63. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu
  64. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn_splitk_parallel.cu
  65. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt.cu
  66. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt_splitk_parallel.cu
  67. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu
  68. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn_splitk_parallel.cu
  69. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt.cu
  70. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt_splitk_parallel.cu
  71. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu
  72. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn_splitk_parallel.cu
  73. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu
  74. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt_splitk_parallel.cu
  75. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn.cu
  76. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn_splitk_parallel.cu
  77. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu
  78. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt_splitk_parallel.cu
  79. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu
  80. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn_splitk_parallel.cu
  81. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu
  82. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt_splitk_parallel.cu
  83. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn.cu
  84. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn_splitk_parallel.cu
  85. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu
  86. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt_splitk_parallel.cu
  87. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu
  88. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn_splitk_parallel.cu
  89. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu
  90. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt_splitk_parallel.cu
  91. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu
  92. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn_splitk_parallel.cu
  93. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu
  94. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt_splitk_parallel.cu
  95. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn.cu
  96. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn_splitk_parallel.cu
  97. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu
  98. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt_splitk_parallel.cu
  99. +3
    -1
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu
  100. +33
    -0
      dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn_splitk_parallel.cu

+ 2
- 2
dnn/scripts/Makefile View File

@@ -9,9 +9,9 @@ ELEMWISE_IMPL := ../src/cuda/cond_take/kimpl \
../src/cuda/elemwise_multi_type/kimpl

CUDA_CONV_IMPL := ../src/cuda/conv_bias/int8/kimpl ../src/cuda/conv_bias/int8_imma/kimpl ../src/cuda/batch_conv_bias/int8/kimpl
CUDA_MATMUL_KIMPL := ../src/cuda/matrix_mul/fp32_simt/kimpl
CUDA_MATMUL_IMPL := ../src/cuda/matrix_mul/fp32_simt/kimpl

all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} $(CUDA_MATMUL_KIMPL)
all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} $(CUDA_MATMUL_IMPL)

../src/common/elemwise/each_mode.inl: gen_elemwise_each_mode.py
./$^ $@


+ 20
- 0
dnn/src/cuda/matrix_mul/algos.cpp View File

@@ -37,6 +37,9 @@ MatrixMulForwardImpl::AlgoPack::AlgoPack() {
for (auto&& algo : simt_float32) {
all_algos.push_back(&algo);
}
for (auto&& algo : simt_float32_split_k) {
all_algos.push_back(&algo);
}

for (auto&& algo : all_algos) {
m_all_algos_map.emplace(algo->info().desc, algo);
@@ -62,6 +65,23 @@ void MatrixMulForwardImpl::AlgoPack::fill_cutlass_algos() {
simt_float32.emplace_back(AlgoParam{16, 32, 8, 16, 32, 8});
simt_float32.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8});
simt_float32.emplace_back(AlgoParam{16, 128, 8, 16, 64, 8});
simt_float32_split_k.emplace_back(AlgoParam{64, 256, 8, 32, 64, 8});
simt_float32_split_k.emplace_back(AlgoParam{256, 64, 8, 64, 32, 8});
simt_float32_split_k.emplace_back(AlgoParam{32, 256, 8, 16, 64, 8});
simt_float32_split_k.emplace_back(AlgoParam{256, 32, 8, 64, 16, 8});
simt_float32_split_k.emplace_back(AlgoParam{128, 128, 8, 32, 64, 8});
simt_float32_split_k.emplace_back(AlgoParam{128, 64, 8, 64, 32, 8});
simt_float32_split_k.emplace_back(AlgoParam{64, 128, 8, 32, 64, 8});
simt_float32_split_k.emplace_back(AlgoParam{128, 32, 8, 64, 32, 8});
simt_float32_split_k.emplace_back(AlgoParam{32, 128, 8, 32, 64, 8});
simt_float32_split_k.emplace_back(AlgoParam{64, 64, 8, 32, 64, 8});
simt_float32_split_k.emplace_back(AlgoParam{32, 64, 8, 32, 64, 8});
simt_float32_split_k.emplace_back(AlgoParam{64, 32, 8, 64, 32, 8});
simt_float32_split_k.emplace_back(AlgoParam{32, 32, 8, 32, 32, 8});
simt_float32_split_k.emplace_back(AlgoParam{8, 32, 8, 8, 32, 8});
simt_float32_split_k.emplace_back(AlgoParam{16, 32, 8, 16, 32, 8});
simt_float32_split_k.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8});
simt_float32_split_k.emplace_back(AlgoParam{16, 128, 8, 16, 64, 8});
}

MatrixMulForwardImpl::AlgoPack MatrixMulForwardImpl::sm_algo_pack;


+ 27
- 0
dnn/src/cuda/matrix_mul/algos.h View File

@@ -43,6 +43,7 @@ public:
CUDA_NAIVE,
CUDA_BFLOAT16,
CUDA_FLOAT32_SIMT,
CUDA_FLOAT32_SIMT_SPLIT_K,
};
using Mapper = std::unordered_map<AlgorithmDesc, AlgoBase*>;

@@ -198,6 +199,31 @@ private:
std::string m_name;
};

class MatrixMulForwardImpl::AlgoFloat32SIMTSplitK final : public AlgoBase {
public:
using AlgoParam = MatrixMulForwardImpl::AlgoFloat32SIMT::AlgoParam;
AlgoFloat32SIMTSplitK(AlgoParam algo_param)
: m_algo_param{algo_param},
m_name{ssprintf("CUTLASS_FLOAT32_SIMT_SPLIT_K_%s",
m_algo_param.to_string().c_str())} {}
bool is_available(const SizeArgs& args) const override;
size_t get_workspace_in_bytes(const SizeArgs& args) const override;
const char* name() const override { return m_name.c_str(); }
void exec(const ExecArgs& args) const override;
bool is_reproducible() const override { return true; }
MEGDNN_DECL_ALGO_TYPE(CUDA_FLOAT32_SIMT_SPLIT_K)

std::string param() const override {
std::string ret;
serialize_write_pod(m_algo_param, ret);
return ret;
}

private:
AlgoParam m_algo_param;
std::string m_name;
};

class MatrixMulForwardImpl::AlgoPack : NonCopyableObj {
private:
AlgoBase::Mapper m_all_algos_map;
@@ -216,6 +242,7 @@ public:
AlgoBFloat16 bfloat16;
#endif
std::vector<AlgoFloat32SIMT> simt_float32;
std::vector<AlgoFloat32SIMTSplitK> simt_float32_split_k;
std::vector<AlgoBase*> all_algos;

const AlgoBase::Mapper& all_algos_map() const { return m_all_algos_map; }


+ 76
- 0
dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp View File

@@ -0,0 +1,76 @@
/**
* \file dnn/src/cuda/matrix_mul/cutlass_float32_simt_split_k.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/

#include "src/cuda/handle.h"
#include "src/cuda/matrix_mul/algos.h"
#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh"
#include "src/cuda/utils.h"

using namespace megdnn;
using namespace cuda;
using namespace cutlass_wrapper;

bool MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::is_available(
const SizeArgs& args) const {
auto&& param = args.opr->param();
int m = args.layout_c.shape[0], n = args.layout_c.shape[1],
k = args.layout_a.shape[param.transposeA ? 0 : 1];
return args.opr->param().format == param::MatrixMul::Format::DEFAULT &&
args.layout_a.dtype == dtype::Float32() &&
args.layout_b.dtype == dtype::Float32() &&
args.layout_c.dtype == dtype::Float32() && k > std::max(m, n);
}

size_t MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::get_workspace_in_bytes(
const SizeArgs& args) const {
size_t lda = args.layout_a.stride[0], ldb = args.layout_b.stride[0],
ldc = args.layout_c.stride[0];
auto&& param = args.opr->param();
int m = args.layout_c.shape[0], n = args.layout_c.shape[1],
k = args.layout_a.shape[param.transposeA ? 0 : 1];
GemmCoord problem_size{m, n, k};
int split_k_slices = k / std::max(m, n);
return cutlass_matrix_mul_float32_simt_get_workspace_size(
param.transposeA, lda, param.transposeB, ldb, ldc, problem_size,
1.f, 0.f,
GemmCoord{m_algo_param.threadblock_m, m_algo_param.threadblock_n,
m_algo_param.threadblock_k},
GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n,
m_algo_param.warp_k},
split_k_slices);
}

void MatrixMulForwardImpl::AlgoFloat32SIMTSplitK::exec(
const ExecArgs& args) const {
size_t lda = args.tensor_a.layout.stride[0],
ldb = args.tensor_b.layout.stride[0],
ldc = args.tensor_c.layout.stride[0];
auto&& param = args.opr->param();
int m = args.tensor_c.layout.shape[0], n = args.tensor_c.layout.shape[1],
k = args.tensor_a.layout.shape[param.transposeA ? 0 : 1];
GemmCoord problem_size{m, n, k};
int split_k_slices = k / std::max(m, n);
auto&& stream = cuda_stream(args.opr->handle());
int* workspace = reinterpret_cast<int*>(args.workspace.raw_ptr);
return cutlass_matrix_mul_float32_simt(
args.tensor_a.ptr<dt_float32>(), param.transposeA, lda,
args.tensor_b.ptr<dt_float32>(), param.transposeB, ldb,
args.tensor_c.ptr<dt_float32>(), ldc, workspace, problem_size, 1.f,
0.f,
GemmCoord{m_algo_param.threadblock_m, m_algo_param.threadblock_n,
m_algo_param.threadblock_k},
GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n,
m_algo_param.warp_k},
stream, split_k_slices);
}

// vim: syntax=cpp.doxygen

+ 145
- 45
dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cu View File

@@ -18,6 +18,7 @@
#if __CUDACC_VER_MAJOR__ > 9 || \
(__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
#include "cutlass/gemm/device/gemm.h"
#include "cutlass/gemm/device/gemm_splitk_parallel.h"
#endif
#include "src/common/opr_param_defs_enumv.cuh"
#include "src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh"
@@ -62,14 +63,20 @@ void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_float32_simt(
float* /* d_C */, size_t /* ldc */, int* /* workspace */,
GemmCoord const& /* problem_size */, float /* alpha */,
float /* beta */, const GemmCoord& /* threadblock_shape */,
const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {}
const GemmCoord& /* warp_shape */, cudaStream_t /* stream */,
int /* split_k_slices */) {}
#else
void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_float32_simt(
const float* d_A, bool transpose_A, size_t lda, const float* d_B,
bool transpose_B, size_t ldb, float* d_C, size_t ldc, int* workspace,
GemmCoord const& problem_size, float alpha, float beta,
const GemmCoord& threadblock_shape, const GemmCoord& warp_shape,
cudaStream_t stream) {
cudaStream_t stream, int split_k_slices) {
static constexpr int kEpilogueElementsPerAccess = 1;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
float, kEpilogueElementsPerAccess, float, float>;
typename EpilogueOp::Params epilogue{alpha, beta};
if (split_k_slices == 1) {
#define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \
warp_k_) \
if (threadblock_shape.m() == threadblock_m_ && \
@@ -93,29 +100,67 @@ void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_float32_simt(
workspace, problem_size, \
epilogue, stream); \
}
static constexpr int kEpilogueElementsPerAccess = 1;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
float, kEpilogueElementsPerAccess, float, float>;
typename EpilogueOp::Params epilogue{alpha, beta};
if (!transpose_A && !transpose_B) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
DISPATCH(cb)
} else if (!transpose_A && transpose_B) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
DISPATCH(cb)
} else if (transpose_A && !transpose_B) {
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
DISPATCH(cb)
if (!transpose_A && !transpose_B) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
DISPATCH(cb)
} else if (!transpose_A && transpose_B) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
DISPATCH(cb)
} else if (transpose_A && !transpose_B) {
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
DISPATCH(cb)
} else {
megdnn_assert(transpose_A && transpose_B);
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::ColumnMajor;
DISPATCH(cb)
}
#undef cb
} else {
megdnn_assert(transpose_A && transpose_B);
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::ColumnMajor;
DISPATCH(cb)
#define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \
warp_k_) \
if (threadblock_shape.m() == threadblock_m_ && \
threadblock_shape.n() == threadblock_n_ && \
threadblock_shape.k() == threadblock_k_ && \
warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \
warp_shape.k() == warp_k_) { \
using ThreadBlockShape = \
cutlass::gemm::GemmShape<threadblock_m_, threadblock_n_, \
threadblock_k_>; \
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; \
using Gemm = cutlass::gemm::device::GemmSplitKParallel< \
float, LayoutA, float, LayoutB, float, \
cutlass::layout::RowMajor, float, cutlass::arch::OpClassSimt, \
cutlass::arch::Sm50, ThreadBlockShape, WarpShape, \
InstructionShape, EpilogueOp>; \
return cutlass_matrix_mul_wrapper<Gemm>( \
d_A, lda, d_B, ldb, d_C, ldc, workspace, problem_size, \
epilogue, stream, split_k_slices); \
}
if (!transpose_A && !transpose_B) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
DISPATCH(cb)
} else if (!transpose_A && transpose_B) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
DISPATCH(cb)
} else if (transpose_A && !transpose_B) {
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
DISPATCH(cb)
} else {
megdnn_assert(transpose_A && transpose_B);
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::ColumnMajor;
DISPATCH(cb)
}
#undef cb
}
}
#endif

@@ -127,7 +172,7 @@ size_t megdnn::cuda::cutlass_wrapper::
bool /* transpose_B */, size_t /* ldb */, size_t /* ldc */,
GemmCoord const& /* problem_size */, float /* alpha */,
float /* beta */, const GemmCoord& /* threadblock_shape */,
const GemmCoord& /* warp_shape */) {
const GemmCoord& /* warp_shape */, int /* split_k_slices */) {
return 0;
}
#else
@@ -136,7 +181,12 @@ size_t megdnn::cuda::cutlass_wrapper::
bool transpose_A, size_t lda, bool transpose_B, size_t ldb,
size_t ldc, GemmCoord const& problem_size, float alpha,
float beta, const GemmCoord& threadblock_shape,
const GemmCoord& warp_shape) {
const GemmCoord& warp_shape, int split_k_slices) {
static constexpr int kEpilogueElementsPerAccess = 1;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
float, kEpilogueElementsPerAccess, float, float>;
typename EpilogueOp::Params epilogue{alpha, beta};
if (split_k_slices == 1) {
#define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \
warp_k_) \
if (threadblock_shape.m() == threadblock_m_ && \
@@ -169,30 +219,80 @@ size_t megdnn::cuda::cutlass_wrapper::
split_k_slices}; \
return Gemm::get_workspace_size(arguments); \
}
static constexpr int kEpilogueElementsPerAccess = 1;
static constexpr int split_k_slices = 1;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<
float, kEpilogueElementsPerAccess, float, float>;
typename EpilogueOp::Params epilogue{alpha, beta};
if (!transpose_A && !transpose_B) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
DISPATCH(cb)
} else if (!transpose_A && transpose_B) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
DISPATCH(cb)
} else if (transpose_A && !transpose_B) {
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
DISPATCH(cb)
if (!transpose_A && !transpose_B) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
DISPATCH(cb)
} else if (!transpose_A && transpose_B) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
DISPATCH(cb)
} else if (transpose_A && !transpose_B) {
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
DISPATCH(cb)
} else {
megdnn_assert(transpose_A && transpose_B);
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::ColumnMajor;
DISPATCH(cb)
}
#undef cb
} else {
megdnn_assert(transpose_A && transpose_B);
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::ColumnMajor;
DISPATCH(cb)
#define cb(threadblock_m_, threadblock_n_, threadblock_k_, warp_m_, warp_n_, \
warp_k_) \
if (threadblock_shape.m() == threadblock_m_ && \
threadblock_shape.n() == threadblock_n_ && \
threadblock_shape.k() == threadblock_k_ && \
warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \
warp_shape.k() == warp_k_) { \
using ThreadBlockShape = \
cutlass::gemm::GemmShape<threadblock_m_, threadblock_n_, \
threadblock_k_>; \
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>; \
using Gemm = cutlass::gemm::device::GemmSplitKParallel< \
float, LayoutA, float, LayoutB, float, \
cutlass::layout::RowMajor, float, cutlass::arch::OpClassSimt, \
cutlass::arch::Sm50, ThreadBlockShape, WarpShape, \
InstructionShape, EpilogueOp>; \
using TensorRefA = cutlass::TensorRef<typename Gemm::ElementA const, \
typename Gemm::LayoutA>; \
using TensorRefB = cutlass::TensorRef<typename Gemm::ElementB const, \
typename Gemm::LayoutB>; \
using TensorRefC = cutlass::TensorRef<typename Gemm::ElementC const, \
typename Gemm::LayoutC>; \
using TensorRefD = cutlass::TensorRef<typename Gemm::ElementC, \
typename Gemm::LayoutC>; \
TensorRefA tensor_A{nullptr, Gemm::LayoutA{static_cast<int>(lda)}}; \
TensorRefB tensor_B{nullptr, Gemm::LayoutB{static_cast<int>(ldb)}}; \
TensorRefC tensor_C{nullptr, Gemm::LayoutC{static_cast<int>(ldc)}}; \
TensorRefD tensor_D{nullptr, Gemm::LayoutC{static_cast<int>(ldc)}}; \
typename Gemm::Arguments arguments{problem_size, tensor_A, tensor_B, \
tensor_C, tensor_D, epilogue, \
split_k_slices}; \
return Gemm::get_workspace_size(arguments); \
}
if (!transpose_A && !transpose_B) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
DISPATCH(cb)
} else if (!transpose_A && transpose_B) {
using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
DISPATCH(cb)
} else if (transpose_A && !transpose_B) {
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
DISPATCH(cb)
} else {
megdnn_assert(transpose_A && transpose_B);
using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::ColumnMajor;
DISPATCH(cb)
}
#undef cb
}
}
#endif



+ 3
- 3
dnn/src/cuda/matrix_mul/cutlass_matrix_mul_wrapper.cuh View File

@@ -26,19 +26,19 @@ void cutlass_matrix_mul_wrapper(
typename Gemm::ElementC* d_C, size_t ldc, int* workspace,
GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices = 1);

void cutlass_matrix_mul_float32_simt(
const float* d_A, bool transpose_A, size_t lda, const float* d_B,
bool transpose_B, size_t ldb, float* d_C, size_t ldc, int* workspace,
GemmCoord const& problem_size, float alpha, float beta,
const GemmCoord& threadblock_shape, const GemmCoord& warp_shape,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices = 1);

size_t cutlass_matrix_mul_float32_simt_get_workspace_size(
bool transpose_A, size_t lda, bool transpose_B, size_t ldb, size_t ldc,
GemmCoord const& problem_size, float alpha, float beta,
const GemmCoord& threadblock_shape, const GemmCoord& warp_shape);
const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, int split_k_slices = 1);

} // namespace cutlass_wrapper
} // namespace cuda


+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_nt_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x128x8_32x64x8_tt_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 8>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_nt_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x32x8_64x32x8_tt_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 8>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_nt_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_128x64x8_64x32x8_tt_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 8>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>;
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_nt_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>;
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>;
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x128x8_16x64x8_tt_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 8>;
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>;
using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_nt_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>;
using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>;
using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x32x8_16x32x8_tt_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 32, 8>;
using WarpShape = cutlass::gemm::GemmShape<16, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>;
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_nt_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>;
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>;
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_16x64x8_16x64x8_tt_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>;
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>;
using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_nt_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>;
using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>;
using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x32x8_64x16x8_tt_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 32, 8>;
using WarpShape = cutlass::gemm::GemmShape<64, 16, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_nt_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_256x64x8_64x32x8_tt_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 64, 8>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_nt_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x128x8_32x64x8_tt_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 8>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>;
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_nt_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>;
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>;
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x256x8_16x64x8_tt_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 256, 8>;
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>;
using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_nt_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>;
using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>;
using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x32x8_32x32x8_tt_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 8>;
using WarpShape = cutlass::gemm::GemmShape<32, 32, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_nt_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::RowMajor;
using LayoutB = cutlass::layout::ColumnMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 3
- 1
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn.cu View File

@@ -5,6 +5,7 @@
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
@@ -28,6 +29,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

+ 33
- 0
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/matrix_mul_fp32_simt_32x64x8_32x64x8_tn_splitk_parallel.cu View File

@@ -0,0 +1,33 @@
#if __CUDACC_VER_MAJOR__ > 9 || (__CUDACC_VER_MAJOR__ == 9 && __CUDACC_VER_MINOR__ >= 2)
// generated by gen_cutlass_matrix_mul_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#pragma GCC diagnostic ignored "-Wuninitialized"
#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
#include "src/cuda/matrix_mul/fp32_simt/matrix_mul_float_simt_cutlass_wrapper.cuinl"

using LayoutA = cutlass::layout::ColumnMajor;
using LayoutB = cutlass::layout::RowMajor;
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 8>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 1>;
using EpilogueOp = cutlass::epilogue::thread::LinearCombination<float, 1, float, float>;
using Gemm = cutlass::gemm::device::GemmSplitKParallel<
float, LayoutA,
float, LayoutB,
float, cutlass::layout::RowMajor, float,
cutlass::arch::OpClassSimt, cutlass::arch::Sm50,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp>;
template void megdnn::cuda::cutlass_wrapper::cutlass_matrix_mul_wrapper<Gemm>(
const typename Gemm::ElementA* d_A, size_t lda,
const typename Gemm::ElementB* d_B, size_t ldb,
typename Gemm::ElementC* d_C, size_t ldc,
int* workspace,
cutlass::gemm::GemmCoord const& problem_size,
typename Gemm::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, int split_k_slices);

#pragma GCC diagnostic pop
#endif

Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save