Browse Source

refactor(dnn/cuda): refactor kernel generator for cutlass convolution kernels

GitOrigin-RevId: 7882f9c68c
release-1.5
Megvii Engine Team huangxinda 4 years ago
parent
commit
4abf7bd36f
100 changed files with 557 additions and 3180 deletions
  1. +3
    -4
      dnn/scripts/Makefile
  2. +4
    -14
      dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu
  3. +0
    -65
      dnn/src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl
  4. +0
    -36
      dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_hswish.cu
  5. +0
    -36
      dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_id.cu
  6. +0
    -36
      dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_relu.cu
  7. +0
    -36
      dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_hswish.cu
  8. +0
    -36
      dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_id.cu
  9. +0
    -36
      dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_relu.cu
  10. +0
    -36
      dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_hswish.cu
  11. +0
    -36
      dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_id.cu
  12. +0
    -36
      dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_relu.cu
  13. +0
    -36
      dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_hswish.cu
  14. +0
    -36
      dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_id.cu
  15. +0
    -36
      dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_relu.cu
  16. +55
    -0
      dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64.cu
  17. +55
    -0
      dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64.cu
  18. +55
    -0
      dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64.cu
  19. +55
    -0
      dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64.cu
  20. +55
    -0
      dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64.cu
  21. +55
    -0
      dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64.cu
  22. +55
    -0
      dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64.cu
  23. +55
    -0
      dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64.cu
  24. +55
    -0
      dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64.cu
  25. +55
    -0
      dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64.cu
  26. +0
    -1
      dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl
  27. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_hswish.cu
  28. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_id.cu
  29. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_relu.cu
  30. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_hswish.cu
  31. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_id.cu
  32. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_relu.cu
  33. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_hswish.cu
  34. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_id.cu
  35. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_relu.cu
  36. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu
  37. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu
  38. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu
  39. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_hswish.cu
  40. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu
  41. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_relu.cu
  42. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_hswish.cu
  43. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_id.cu
  44. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_relu.cu
  45. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_hswish.cu
  46. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_id.cu
  47. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_relu.cu
  48. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_hswish.cu
  49. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_id.cu
  50. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_relu.cu
  51. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu
  52. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu
  53. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu
  54. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_hswish.cu
  55. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_id.cu
  56. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_relu.cu
  57. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_hswish.cu
  58. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_id.cu
  59. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_relu.cu
  60. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_hswish.cu
  61. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_id.cu
  62. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_relu.cu
  63. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_hswish.cu
  64. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_id.cu
  65. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_relu.cu
  66. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_hswish.cu
  67. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_id.cu
  68. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_relu.cu
  69. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_hswish.cu
  70. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_id.cu
  71. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_relu.cu
  72. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_hswish.cu
  73. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_id.cu
  74. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_relu.cu
  75. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_hswish.cu
  76. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu
  77. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_relu.cu
  78. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_hswish.cu
  79. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_id.cu
  80. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_relu.cu
  81. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_hswish.cu
  82. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_id.cu
  83. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_relu.cu
  84. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_hswish.cu
  85. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu
  86. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_relu.cu
  87. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_hswish.cu
  88. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_id.cu
  89. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_relu.cu
  90. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_hswish.cu
  91. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_id.cu
  92. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_relu.cu
  93. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_hswish.cu
  94. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_id.cu
  95. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_relu.cu
  96. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_hswish.cu
  97. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_id.cu
  98. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_relu.cu
  99. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_hswish.cu
  100. +0
    -36
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_id.cu

+ 3
- 4
dnn/scripts/Makefile View File

@@ -37,14 +37,13 @@ all: ${PARAM_DEFS} ${ELEMWISE_IMPL} ${CUDA_CONV_IMPL} $(CUDA_MATMUL_IMPL)
../src/cuda/elemwise_multi_type/kimpl: gen_elemwise_multi_type_kern_impls.py
./$^ --type cuda $@

../src/cuda/conv_bias/int8/kimpl: gen_cuda_conv_bias_kern_impls.py gen_cutlass_conv_bias_kern_impls.py cutlass_generator/generator.py
../src/cuda/conv_bias/int8/kimpl: gen_cuda_conv_bias_kern_impls.py cutlass_generator
./gen_cuda_conv_bias_kern_impls.py --type dp4a $@
./gen_cutlass_conv_bias_kern_impls.py --type dp4a $@
python3 ./cutlass_generator/generator.py --operations all --type simt $@

../src/cuda/conv_bias/int8_imma/kimpl: gen_cuda_conv_bias_kern_impls.py gen_cutlass_conv_bias_kern_impls.py
../src/cuda/conv_bias/int8_imma/kimpl: gen_cuda_conv_bias_kern_impls.py cutlass_generator
./gen_cuda_conv_bias_kern_impls.py --type imma $@
./gen_cutlass_conv_bias_kern_impls.py --type imma $@
python3 ./cutlass_generator/generator.py --operations conv2d --type tensorop8816 $@

../src/cuda/batch_conv_bias/int8/kimpl: gen_cuda_batch_conv_bias_kern_impls.py
./$^ --type dp4a $@


+ 4
- 14
dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu View File

@@ -807,9 +807,9 @@ void megdnn::cuda::cutlass_wrapper::
const int32_t* d_bias, const uint8_t* d_z, uint8_t* d_dst,
int* workspace, const convolution::ConvParam& param,
uint32_t nonlinear_mode, float alpha, float beta, float gamma,
float delta, float theta, float scale, uint8_t src_zero_point,
const GemmCoord& threadblock_shape, const GemmCoord& warp_shape,
cudaStream_t stream) {
float delta, float theta, float /* scale */,
uint8_t src_zero_point, const GemmCoord& threadblock_shape,
const GemmCoord& warp_shape, cudaStream_t stream) {
#define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \
threadblock_k_, warp_m_, warp_n_, \
warp_k_) \
@@ -878,15 +878,6 @@ void megdnn::cuda::cutlass_wrapper::
0, delta, theta};
DISPATCH_KERNEL;
}
case NonlineMode::H_SWISH: {
using EpilogueOp = cutlass::epilogue::thread::
BiasAddLinearCombinationHSwishClamp<
ElementOutput, 16, ElementAccumulator, ElementBias,
ElementCompute>;
typename EpilogueOp::Params epilogue{alpha, beta, gamma,
scale, delta, theta};
DISPATCH_KERNEL;
}
default:
megdnn_assert(false,
"unsupported nonlinear mode for conv bias operator");
@@ -960,8 +951,7 @@ void megdnn::cuda::cutlass_wrapper::
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \
cutlass::conv::threadblock:: \
ConvolutionFpropNCxHWxThreadblockSwizzle, \
stages_, 4, aligned_, true, \
cutlass::arch::OpMultiplyAddSaturate>; \
stages_, 4, aligned_, true, cutlass::arch::OpMultiplyAdd>; \
typename Convolution::ConvolutionParameter conv_param( \
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \
param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \


+ 0
- 65
dnn/src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl View File

@@ -1,65 +0,0 @@
/**
* \file
* dnn/src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied.
*/
#include "cutlass/convolution/device/convolution.h"
#include "src/cuda/conv_bias/cutlass_convolution_wrapper.cuh"

using namespace megdnn;
using namespace cuda;
using namespace cutlass_wrapper;

/*!
 * \brief Wrap raw device pointers into tensor references and run one
 * convolution with the given \p Convolution operator type.
 *
 * \tparam Convolution operator type; it must expose the element, layout,
 *      tensor-ref, parameter and argument member types used below.
 * \param d_src pointer to the source tensor, laid out as packed (N, H, W, C)
 * \param d_filter pointer to the filter tensor, laid out as packed (K, R, S, C)
 * \param d_bias pointer to the bias tensor, laid out as packed (1, 1, 1, K)
 * \param d_z pointer to the z tensor, laid out as packed (N, P, Q, K)
 * \param d_dst pointer to the output tensor, laid out as packed (N, P, Q, K)
 * \param workspace scratch buffer handed to Convolution::initialize
 * \param conv_param problem description; its N/H/W/C/K/R/S/P/Q members
 *      supply the extents of every tensor
 * \param epilogue parameters forwarded unchanged to the epilogue functor
 * \param stream stream on which the operator is invoked
 * \param extra_param forwarded unchanged as the last operator argument
 */
template <typename Convolution>
void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst, int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream, typename Convolution::ExtraParam extra_param) {
    using SrcElem = typename Convolution::ElementSrc;
    using FilterElem = typename Convolution::ElementFilter;
    using BiasElem = typename Convolution::ElementBias;
    using DstElem = typename Convolution::ElementDst;

    // The tensor-ref types hold non-const element pointers, so constness of
    // the read-only inputs is cast away when they are wrapped.
    typename Convolution::TensorRefSrc src_ref{
            const_cast<SrcElem*>(d_src),
            Convolution::LayoutSrc::packed(
                    {conv_param.N, conv_param.H, conv_param.W, conv_param.C})};

    typename Convolution::TensorRefFilter filter_ref{
            const_cast<FilterElem*>(d_filter),
            Convolution::LayoutFilter::packed(
                    {conv_param.K, conv_param.R, conv_param.S, conv_param.C})};

    typename Convolution::TensorRefBias bias_ref{
            const_cast<BiasElem*>(d_bias),
            Convolution::LayoutBias::packed({1, 1, 1, conv_param.K})};

    // z and dst share the element type and the (N, P, Q, K) extent.
    typename Convolution::TensorRefDst z_ref{
            const_cast<DstElem*>(d_z),
            Convolution::LayoutDst::packed(
                    {conv_param.N, conv_param.P, conv_param.Q, conv_param.K})};

    typename Convolution::TensorRefDst dst_ref{
            d_dst,
            Convolution::LayoutDst::packed(
                    {conv_param.N, conv_param.P, conv_param.Q, conv_param.K})};

    // NOTE(review): the two empty-brace arguments are default-initialized
    // members of Convolution::Arguments; confirm their meaning against the
    // CUTLASS declaration.
    typename Convolution::Arguments op_args{
            conv_param,
            src_ref.non_const_ref(),
            filter_ref.non_const_ref(),
            bias_ref.non_const_ref(),
            z_ref.non_const_ref(),
            dst_ref.non_const_ref(),
            epilogue,
            {},
            {},
            extra_param};

    // Initialize first, then invoke on the stream; both statuses go through
    // cutlass_check before the post-launch hook runs.
    Convolution op;
    cutlass_check(op.initialize(op_args, workspace));
    cutlass_check(op(stream));
    after_kernel_launch();
}

// vim: syntax=cuda.doxygen

+ 0
- 36
dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_hswish.cu View File

@@ -1,36 +0,0 @@
// The whole translation unit is compiled out when MEGDNN_TEGRA_X1 is nonzero.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int4_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: int4b_t elements paired with the src/filter/dst layouts,
// 128x128x128 threadblock tile, 64x64x128 warp tile, 8x8x32 instruction
// shape and the BiasAddLinearCombinationHSwishClamp epilogue.
//
// ignore warning of cutlass
// (the two diagnostics below are disabled until the matching "pop")
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
// Brings in the definition of the wrapper template instantiated below.
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts, all with an interleave factor of 64.
using LayoutSrc = cutlass::layout::TensorNCxHWx<64>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<64>;
using LayoutDst = cutlass::layout::TensorNCxHWx<64>;
// Tile shapes at threadblock, warp and instruction level.
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 128>;
using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>;
// Epilogue functor producing int4b_t output.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t, 16, int32_t, int32_t, float>;
// NOTE(review): the meaning of the trailing positional arguments
// (2, 32, 32, true) is defined by cutlass::conv::device::Convolution --
// confirm against its declaration.
using Convolution = cutlass::conv::device::Convolution<
cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 32, 32, true,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation so the kernel for this configuration is emitted in
// this translation unit.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_id.cu View File

@@ -1,36 +0,0 @@
// The whole translation unit is compiled out when MEGDNN_TEGRA_X1 is nonzero.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int4_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: int4b_t elements paired with the src/filter/dst layouts,
// 128x128x128 threadblock tile, 64x64x128 warp tile, 8x8x32 instruction
// shape and the BiasAddLinearCombinationClamp epilogue.
//
// ignore warning of cutlass
// (the two diagnostics below are disabled until the matching "pop")
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
// Brings in the definition of the wrapper template instantiated below.
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts, all with an interleave factor of 64.
using LayoutSrc = cutlass::layout::TensorNCxHWx<64>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<64>;
using LayoutDst = cutlass::layout::TensorNCxHWx<64>;
// Tile shapes at threadblock, warp and instruction level.
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 128>;
using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>;
// Epilogue functor producing int4b_t output.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t, 16, int32_t, int32_t, float>;
// NOTE(review): the meaning of the trailing positional arguments
// (2, 32, 32, true) is defined by cutlass::conv::device::Convolution --
// confirm against its declaration.
using Convolution = cutlass::conv::device::Convolution<
cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 32, 32, true,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation so the kernel for this configuration is emitted in
// this translation unit.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_relu.cu View File

@@ -1,36 +0,0 @@
// The whole translation unit is compiled out when MEGDNN_TEGRA_X1 is nonzero.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int4_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: int4b_t elements paired with the src/filter/dst layouts,
// 128x128x128 threadblock tile, 64x64x128 warp tile, 8x8x32 instruction
// shape and the BiasAddLinearCombinationReluClamp epilogue.
//
// ignore warning of cutlass
// (the two diagnostics below are disabled until the matching "pop")
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
// Brings in the definition of the wrapper template instantiated below.
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts, all with an interleave factor of 64.
using LayoutSrc = cutlass::layout::TensorNCxHWx<64>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<64>;
using LayoutDst = cutlass::layout::TensorNCxHWx<64>;
// Tile shapes at threadblock, warp and instruction level.
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 128>;
using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>;
// Epilogue functor producing int4b_t output.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::int4b_t, 16, int32_t, int32_t, float>;
// NOTE(review): the meaning of the trailing positional arguments
// (2, 32, 32, true) is defined by cutlass::conv::device::Convolution --
// confirm against its declaration.
using Convolution = cutlass::conv::device::Convolution<
cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 32, 32, true,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation so the kernel for this configuration is emitted in
// this translation unit.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_hswish.cu View File

@@ -1,36 +0,0 @@
// The whole translation unit is compiled out when MEGDNN_TEGRA_X1 is nonzero.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int4_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: int4b_t elements paired with the src/filter/dst layouts,
// 256x128x128 threadblock tile, 64x64x128 warp tile, 8x8x32 instruction
// shape and the BiasAddLinearCombinationHSwishClamp epilogue.
//
// ignore warning of cutlass
// (the two diagnostics below are disabled until the matching "pop")
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
// Brings in the definition of the wrapper template instantiated below.
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts, all with an interleave factor of 64.
using LayoutSrc = cutlass::layout::TensorNCxHWx<64>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<64>;
using LayoutDst = cutlass::layout::TensorNCxHWx<64>;
// Tile shapes at threadblock, warp and instruction level.
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 128>;
using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>;
// Epilogue functor producing int4b_t output.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t, 16, int32_t, int32_t, float>;
// NOTE(review): the meaning of the trailing positional arguments
// (2, 32, 32, true) is defined by cutlass::conv::device::Convolution --
// confirm against its declaration.
using Convolution = cutlass::conv::device::Convolution<
cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 32, 32, true,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation so the kernel for this configuration is emitted in
// this translation unit.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_id.cu View File

@@ -1,36 +0,0 @@
// The whole translation unit is compiled out when MEGDNN_TEGRA_X1 is nonzero.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int4_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: int4b_t elements paired with the src/filter/dst layouts,
// 256x128x128 threadblock tile, 64x64x128 warp tile, 8x8x32 instruction
// shape and the BiasAddLinearCombinationClamp epilogue.
//
// ignore warning of cutlass
// (the two diagnostics below are disabled until the matching "pop")
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
// Brings in the definition of the wrapper template instantiated below.
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts, all with an interleave factor of 64.
using LayoutSrc = cutlass::layout::TensorNCxHWx<64>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<64>;
using LayoutDst = cutlass::layout::TensorNCxHWx<64>;
// Tile shapes at threadblock, warp and instruction level.
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 128>;
using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>;
// Epilogue functor producing int4b_t output.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t, 16, int32_t, int32_t, float>;
// NOTE(review): the meaning of the trailing positional arguments
// (2, 32, 32, true) is defined by cutlass::conv::device::Convolution --
// confirm against its declaration.
using Convolution = cutlass::conv::device::Convolution<
cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 32, 32, true,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation so the kernel for this configuration is emitted in
// this translation unit.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_int4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_relu.cu View File

@@ -1,36 +0,0 @@
// The whole translation unit is compiled out when MEGDNN_TEGRA_X1 is nonzero.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int4_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: int4b_t elements paired with the src/filter/dst layouts,
// 256x128x128 threadblock tile, 64x64x128 warp tile, 8x8x32 instruction
// shape and the BiasAddLinearCombinationReluClamp epilogue.
//
// ignore warning of cutlass
// (the two diagnostics below are disabled until the matching "pop")
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
// Brings in the definition of the wrapper template instantiated below.
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts, all with an interleave factor of 64.
using LayoutSrc = cutlass::layout::TensorNCxHWx<64>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<64>;
using LayoutDst = cutlass::layout::TensorNCxHWx<64>;
// Tile shapes at threadblock, warp and instruction level.
using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 128>;
using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>;
// Epilogue functor producing int4b_t output.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::int4b_t, 16, int32_t, int32_t, float>;
// NOTE(review): the meaning of the trailing positional arguments
// (2, 32, 32, true) is defined by cutlass::conv::device::Convolution --
// confirm against its declaration.
using Convolution = cutlass::conv::device::Convolution<
cutlass::int4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::int4b_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 32, 32, true,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation so the kernel for this configuration is emitted in
// this translation unit.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_hswish.cu View File

@@ -1,36 +0,0 @@
// The whole translation unit is compiled out when MEGDNN_TEGRA_X1 is nonzero.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int4_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: uint4b_t elements paired with the src and dst layouts,
// int4b_t elements paired with the filter layout, 128x128x128 threadblock
// tile, 64x64x128 warp tile, 8x8x32 instruction shape and the
// BiasAddLinearCombinationHSwishClamp epilogue.
//
// ignore warning of cutlass
// (the two diagnostics below are disabled until the matching "pop")
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
// Brings in the definition of the wrapper template instantiated below.
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts, all with an interleave factor of 64.
using LayoutSrc = cutlass::layout::TensorNCxHWx<64>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<64>;
using LayoutDst = cutlass::layout::TensorNCxHWx<64>;
// Tile shapes at threadblock, warp and instruction level.
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 128>;
using WarpShape = cutlass::gemm::GemmShape<64, 64, 128>;
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 32>;
// Epilogue functor producing uint4b_t output.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::uint4b_t, 16, int32_t, int32_t, float>;
// NOTE(review): the meaning of the trailing positional arguments
// (2, 32, 32, true) is defined by cutlass::conv::device::Convolution --
// confirm against its declaration.
using Convolution = cutlass::conv::device::Convolution<
cutlass::uint4b_t, LayoutSrc, cutlass::int4b_t, LayoutFilter, cutlass::uint4b_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 32, 32, true,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation so the kernel for this configuration is emitted in
// this translation unit.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_id.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int4_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl"

// Kernel type for this file, with every template argument written in place
// rather than routed through intermediate aliases.
using Convolution = cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNCxHWx<64>,  // source layout
        cutlass::int4b_t,
        cutlass::layout::TensorCxRSKx<64>,  // filter layout
        cutlass::uint4b_t,
        cutlass::layout::TensorNCxHWx<64>,  // destination layout
        int32_t,
        cutlass::layout::TensorNCxHWx<64>,  // same layout type as the destination
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 128, 128>,  // threadblock shape
        cutlass::gemm::GemmShape<64, 64, 128>,    // warp shape
        cutlass::gemm::GemmShape<8, 8, 32>,       // instruction shape
        // epilogue op
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                cutlass::uint4b_t, 16, int32_t, int32_t, float>,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Explicit instantiation of the wrapper for the kernel type above.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_128x128x128_64x64x128_relu.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int4_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl"

// Kernel type for this file, with every template argument written in place
// rather than routed through intermediate aliases.
using Convolution = cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNCxHWx<64>,  // source layout
        cutlass::int4b_t,
        cutlass::layout::TensorCxRSKx<64>,  // filter layout
        cutlass::uint4b_t,
        cutlass::layout::TensorNCxHWx<64>,  // destination layout
        int32_t,
        cutlass::layout::TensorNCxHWx<64>,  // same layout type as the destination
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<128, 128, 128>,  // threadblock shape
        cutlass::gemm::GemmShape<64, 64, 128>,    // warp shape
        cutlass::gemm::GemmShape<8, 8, 32>,       // instruction shape
        // epilogue op
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                cutlass::uint4b_t, 16, int32_t, int32_t, float>,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Explicit instantiation of the wrapper for the kernel type above.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_hswish.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int4_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl"

// Kernel type for this file, with every template argument written in place
// rather than routed through intermediate aliases.
using Convolution = cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNCxHWx<64>,  // source layout
        cutlass::int4b_t,
        cutlass::layout::TensorCxRSKx<64>,  // filter layout
        cutlass::uint4b_t,
        cutlass::layout::TensorNCxHWx<64>,  // destination layout
        int32_t,
        cutlass::layout::TensorNCxHWx<64>,  // same layout type as the destination
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<256, 128, 128>,  // threadblock shape
        cutlass::gemm::GemmShape<64, 64, 128>,    // warp shape
        cutlass::gemm::GemmShape<8, 8, 32>,       // instruction shape
        // epilogue op
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                cutlass::uint4b_t, 16, int32_t, int32_t, float>,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Explicit instantiation of the wrapper for the kernel type above.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_id.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int4_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl"

// Kernel type for this file, with every template argument written in place
// rather than routed through intermediate aliases.
using Convolution = cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNCxHWx<64>,  // source layout
        cutlass::int4b_t,
        cutlass::layout::TensorCxRSKx<64>,  // filter layout
        cutlass::uint4b_t,
        cutlass::layout::TensorNCxHWx<64>,  // destination layout
        int32_t,
        cutlass::layout::TensorNCxHWx<64>,  // same layout type as the destination
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<256, 128, 128>,  // threadblock shape
        cutlass::gemm::GemmShape<64, 64, 128>,    // warp shape
        cutlass::gemm::GemmShape<8, 8, 32>,       // instruction shape
        // epilogue op
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                cutlass::uint4b_t, 16, int32_t, int32_t, float>,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Explicit instantiation of the wrapper for the kernel type above.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int4/kimpl/conv_bias_uint4_int4_implicit_gemm_imma_ncdiv64hw64_256x128x128_64x64x128_relu.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int4_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int4/conv_bias_int4_implicit_gemm_cutlass_wrapper.cuinl"

// Kernel type for this file, with every template argument written in place
// rather than routed through intermediate aliases.
using Convolution = cutlass::conv::device::Convolution<
        cutlass::uint4b_t,
        cutlass::layout::TensorNCxHWx<64>,  // source layout
        cutlass::int4b_t,
        cutlass::layout::TensorCxRSKx<64>,  // filter layout
        cutlass::uint4b_t,
        cutlass::layout::TensorNCxHWx<64>,  // destination layout
        int32_t,
        cutlass::layout::TensorNCxHWx<64>,  // same layout type as the destination
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassTensorOp,
        cutlass::arch::Sm75,
        cutlass::gemm::GemmShape<256, 128, 128>,  // threadblock shape
        cutlass::gemm::GemmShape<64, 64, 128>,    // warp shape
        cutlass::gemm::GemmShape<8, 8, 32>,       // instruction shape
        // epilogue op
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                cutlass::uint4b_t, 16, int32_t, int32_t, float>,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        32,
        32,
        true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Explicit instantiation of the wrapper for the kernel type above.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_128x128x128_64x64x128_2_nc64hw64" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
cutlass::int4b_t,
cutlass::layout::TensorCxRSKx<64>,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 128, 128>,
cutlass::gemm::GemmShape<64, 64, 128>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t,
16,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate>;


template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_hswish_s4_256x128x128_64x64x128_2_nc64hw64" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
cutlass::int4b_t,
cutlass::layout::TensorCxRSKx<64>,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<256, 128, 128>,
cutlass::gemm::GemmShape<64, 64, 128>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::int4b_t,
16,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate>;


template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_128x128x128_64x64x128_2_nc64hw64" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
cutlass::int4b_t,
cutlass::layout::TensorCxRSKx<64>,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 128, 128>,
cutlass::gemm::GemmShape<64, 64, 128>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t,
16,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate>;


template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_identity_s4_256x128x128_64x64x128_2_nc64hw64" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
cutlass::int4b_t,
cutlass::layout::TensorCxRSKx<64>,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<256, 128, 128>,
cutlass::gemm::GemmShape<64, 64, 128>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::int4b_t,
16,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate>;


template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_128x128x128_64x64x128_2_nc64hw64" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
cutlass::int4b_t,
cutlass::layout::TensorCxRSKx<64>,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 128, 128>,
cutlass::gemm::GemmShape<64, 64, 128>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::int4b_t,
16,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate>;


template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_s4_i8832fprop_relu_s4_256x128x128_64x64x128_2_nc64hw64" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
cutlass::int4b_t,
cutlass::layout::TensorCxRSKx<64>,
cutlass::int4b_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<256, 128, 128>,
cutlass::gemm::GemmShape<64, 64, 128>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::int4b_t,
16,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate>;


template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_128x128x128_64x64x128_2_nc64hw64" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNCxHWx<64>,
cutlass::int4b_t,
cutlass::layout::TensorCxRSKx<64>,
cutlass::uint4b_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 128, 128>,
cutlass::gemm::GemmShape<64, 64, 128>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
16,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate>;


template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_identity_u4_s4_256x128x128_64x64x128_2_nc64hw64" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNCxHWx<64>,
cutlass::int4b_t,
cutlass::layout::TensorCxRSKx<64>,
cutlass::uint4b_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<256, 128, 128>,
cutlass::gemm::GemmShape<64, 64, 128>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
16,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate>;


template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_128x128x128_64x64x128_2_nc64hw64" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNCxHWx<64>,
cutlass::int4b_t,
cutlass::layout::TensorCxRSKx<64>,
cutlass::uint4b_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 128, 128>,
cutlass::gemm::GemmShape<64, 64, 128>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
16,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate>;


template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int4/kimpl/cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_tensorop_u4_i8832fprop_relu_u4_s4_256x128x128_64x64x128_2_nc64hw64" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
cutlass::uint4b_t,
cutlass::layout::TensorNCxHWx<64>,
cutlass::int4b_t,
cutlass::layout::TensorCxRSKx<64>,
cutlass::uint4b_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::layout::TensorNCxHWx<64>,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassTensorOp,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<256, 128, 128>,
cutlass::gemm::GemmShape<64, 64, 128>,
cutlass::gemm::GemmShape<8, 8, 32>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
16,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
32,
32,
true,
cutlass::arch::OpMultiplyAddSaturate>;


template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 1
dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl View File

@@ -1 +0,0 @@
../implicit_gemm_conv_bias_cutlass_wrapper.cuinl

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_hswish.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Kernel type for this file, with every template argument written in place
// rather than routed through intermediate aliases.
using Convolution = cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,  // source layout
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,  // filter layout
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,  // destination layout
        int32_t,
        cutlass::layout::TensorNCxHWx<4>,  // same layout type as the destination
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<128, 128, 32>,  // threadblock shape
        cutlass::gemm::GemmShape<64, 32, 32>,    // warp shape
        cutlass::gemm::GemmShape<1, 1, 4>,       // instruction shape
        // epilogue op
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                int8_t, 4, int32_t, int32_t, float>,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Explicit instantiation of the wrapper for the kernel type above.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_id.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Kernel type for this file, with every template argument written in place
// rather than routed through intermediate aliases.
using Convolution = cutlass::conv::device::Convolution<
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,  // source layout
        int8_t,
        cutlass::layout::TensorCxRSKx<4>,  // filter layout
        int8_t,
        cutlass::layout::TensorNCxHWx<4>,  // destination layout
        int32_t,
        cutlass::layout::TensorNCxHWx<4>,  // same layout type as the destination
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        cutlass::gemm::GemmShape<128, 128, 32>,  // threadblock shape
        cutlass::gemm::GemmShape<64, 32, 32>,    // warp shape
        cutlass::gemm::GemmShape<1, 1, 4>,       // instruction shape
        // epilogue op
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                int8_t, 4, int32_t, int32_t, float>,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2,
        4,
        16,
        true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Explicit instantiation of the wrapper for the kernel type above.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_relu.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: SIMT op class on Sm61, threadblock tile 128x128x32,
// warp tile 64x32x32, BiasAddLinearCombinationReluClamp epilogue.
//
// silence warnings raised by cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// tensor layouts
using SrcLayout = cutlass::layout::TensorNCxHWx<4>;
using FilterLayout = cutlass::layout::TensorCxRSKx<4>;
using DstLayout = cutlass::layout::TensorNCxHWx<4>;

// tile shapes
using TbShape = cutlass::gemm::GemmShape<128, 128, 32>;
using WarpTile = cutlass::gemm::GemmShape<64, 32, 32>;
using InstShape = cutlass::gemm::GemmShape<1, 1, 4>;

// epilogue functor
using Epilogue = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
        int8_t, 4, int32_t, int32_t, float>;

// NOTE(review): the trailing integral/bool arguments are positional; their
// meaning is defined by cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, SrcLayout,
        int8_t, FilterLayout,
        int8_t, DstLayout,
        int32_t, DstLayout,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        TbShape, WarpTile, InstShape, Epilogue,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_hswish.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: SIMT op class on Sm61, threadblock tile 128x32x32,
// warp tile 64x32x32, BiasAddLinearCombinationHSwishClamp epilogue.
//
// silence warnings raised by cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// tensor layouts
using SrcLayout = cutlass::layout::TensorNCxHWx<4>;
using FilterLayout = cutlass::layout::TensorCxRSKx<4>;
using DstLayout = cutlass::layout::TensorNCxHWx<4>;

// tile shapes
using TbShape = cutlass::gemm::GemmShape<128, 32, 32>;
using WarpTile = cutlass::gemm::GemmShape<64, 32, 32>;
using InstShape = cutlass::gemm::GemmShape<1, 1, 4>;

// epilogue functor
using Epilogue = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
        int8_t, 4, int32_t, int32_t, float>;

// NOTE(review): the trailing integral/bool arguments are positional; their
// meaning is defined by cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, SrcLayout,
        int8_t, FilterLayout,
        int8_t, DstLayout,
        int32_t, DstLayout,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        TbShape, WarpTile, InstShape, Epilogue,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_id.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: SIMT op class on Sm61, threadblock tile 128x32x32,
// warp tile 64x32x32, BiasAddLinearCombinationClamp epilogue.
//
// silence warnings raised by cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// tensor layouts
using SrcLayout = cutlass::layout::TensorNCxHWx<4>;
using FilterLayout = cutlass::layout::TensorCxRSKx<4>;
using DstLayout = cutlass::layout::TensorNCxHWx<4>;

// tile shapes
using TbShape = cutlass::gemm::GemmShape<128, 32, 32>;
using WarpTile = cutlass::gemm::GemmShape<64, 32, 32>;
using InstShape = cutlass::gemm::GemmShape<1, 1, 4>;

// epilogue functor
using Epilogue = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
        int8_t, 4, int32_t, int32_t, float>;

// NOTE(review): the trailing integral/bool arguments are positional; their
// meaning is defined by cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, SrcLayout,
        int8_t, FilterLayout,
        int8_t, DstLayout,
        int32_t, DstLayout,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        TbShape, WarpTile, InstShape, Epilogue,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_relu.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: SIMT op class on Sm61, threadblock tile 128x32x32,
// warp tile 64x32x32, BiasAddLinearCombinationReluClamp epilogue.
//
// silence warnings raised by cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// tensor layouts
using SrcLayout = cutlass::layout::TensorNCxHWx<4>;
using FilterLayout = cutlass::layout::TensorCxRSKx<4>;
using DstLayout = cutlass::layout::TensorNCxHWx<4>;

// tile shapes
using TbShape = cutlass::gemm::GemmShape<128, 32, 32>;
using WarpTile = cutlass::gemm::GemmShape<64, 32, 32>;
using InstShape = cutlass::gemm::GemmShape<1, 1, 4>;

// epilogue functor
using Epilogue = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
        int8_t, 4, int32_t, int32_t, float>;

// NOTE(review): the trailing integral/bool arguments are positional; their
// meaning is defined by cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, SrcLayout,
        int8_t, FilterLayout,
        int8_t, DstLayout,
        int32_t, DstLayout,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        TbShape, WarpTile, InstShape, Epilogue,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_hswish.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: SIMT op class on Sm61, threadblock tile 128x64x32,
// warp tile 64x32x32, BiasAddLinearCombinationHSwishClamp epilogue.
//
// silence warnings raised by cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// tensor layouts
using SrcLayout = cutlass::layout::TensorNCxHWx<4>;
using FilterLayout = cutlass::layout::TensorCxRSKx<4>;
using DstLayout = cutlass::layout::TensorNCxHWx<4>;

// tile shapes
using TbShape = cutlass::gemm::GemmShape<128, 64, 32>;
using WarpTile = cutlass::gemm::GemmShape<64, 32, 32>;
using InstShape = cutlass::gemm::GemmShape<1, 1, 4>;

// epilogue functor
using Epilogue = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
        int8_t, 4, int32_t, int32_t, float>;

// NOTE(review): the trailing integral/bool arguments are positional; their
// meaning is defined by cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, SrcLayout,
        int8_t, FilterLayout,
        int8_t, DstLayout,
        int32_t, DstLayout,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        TbShape, WarpTile, InstShape, Epilogue,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_id.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: SIMT op class on Sm61, threadblock tile 128x64x32,
// warp tile 64x32x32, BiasAddLinearCombinationClamp epilogue.
//
// silence warnings raised by cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// tensor layouts
using SrcLayout = cutlass::layout::TensorNCxHWx<4>;
using FilterLayout = cutlass::layout::TensorCxRSKx<4>;
using DstLayout = cutlass::layout::TensorNCxHWx<4>;

// tile shapes
using TbShape = cutlass::gemm::GemmShape<128, 64, 32>;
using WarpTile = cutlass::gemm::GemmShape<64, 32, 32>;
using InstShape = cutlass::gemm::GemmShape<1, 1, 4>;

// epilogue functor
using Epilogue = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
        int8_t, 4, int32_t, int32_t, float>;

// NOTE(review): the trailing integral/bool arguments are positional; their
// meaning is defined by cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, SrcLayout,
        int8_t, FilterLayout,
        int8_t, DstLayout,
        int32_t, DstLayout,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        TbShape, WarpTile, InstShape, Epilogue,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_relu.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: SIMT op class on Sm61, threadblock tile 128x64x32,
// warp tile 64x32x32, BiasAddLinearCombinationReluClamp epilogue.
//
// silence warnings raised by cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// tensor layouts
using SrcLayout = cutlass::layout::TensorNCxHWx<4>;
using FilterLayout = cutlass::layout::TensorCxRSKx<4>;
using DstLayout = cutlass::layout::TensorNCxHWx<4>;

// tile shapes
using TbShape = cutlass::gemm::GemmShape<128, 64, 32>;
using WarpTile = cutlass::gemm::GemmShape<64, 32, 32>;
using InstShape = cutlass::gemm::GemmShape<1, 1, 4>;

// epilogue functor
using Epilogue = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
        int8_t, 4, int32_t, int32_t, float>;

// NOTE(review): the trailing integral/bool arguments are positional; their
// meaning is defined by cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, SrcLayout,
        int8_t, FilterLayout,
        int8_t, DstLayout,
        int32_t, DstLayout,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        TbShape, WarpTile, InstShape, Epilogue,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: SIMT op class on Sm61, threadblock tile 16x128x16,
// warp tile 16x128x16, BiasAddLinearCombinationHSwishClamp epilogue.
//
// silence warnings raised by cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// tensor layouts
using SrcLayout = cutlass::layout::TensorNCxHWx<4>;
using FilterLayout = cutlass::layout::TensorCxRSKx<4>;
using DstLayout = cutlass::layout::TensorNCxHWx<4>;

// tile shapes
using TbShape = cutlass::gemm::GemmShape<16, 128, 16>;
using WarpTile = cutlass::gemm::GemmShape<16, 128, 16>;
using InstShape = cutlass::gemm::GemmShape<1, 1, 4>;

// epilogue functor
using Epilogue = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
        int8_t, 4, int32_t, int32_t, float>;

// NOTE(review): the trailing integral/bool arguments are positional; their
// meaning is defined by cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, SrcLayout,
        int8_t, FilterLayout,
        int8_t, DstLayout,
        int32_t, DstLayout,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        TbShape, WarpTile, InstShape, Epilogue,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        1, 4, 8, true,
        cutlass::arch::OpMultiplyAddSaturate>;

template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: SIMT op class on Sm61, threadblock tile 16x128x16,
// warp tile 16x128x16, BiasAddLinearCombinationClamp epilogue.
//
// silence warnings raised by cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// tensor layouts
using SrcLayout = cutlass::layout::TensorNCxHWx<4>;
using FilterLayout = cutlass::layout::TensorCxRSKx<4>;
using DstLayout = cutlass::layout::TensorNCxHWx<4>;

// tile shapes
using TbShape = cutlass::gemm::GemmShape<16, 128, 16>;
using WarpTile = cutlass::gemm::GemmShape<16, 128, 16>;
using InstShape = cutlass::gemm::GemmShape<1, 1, 4>;

// epilogue functor
using Epilogue = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
        int8_t, 4, int32_t, int32_t, float>;

// NOTE(review): the trailing integral/bool arguments are positional; their
// meaning is defined by cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, SrcLayout,
        int8_t, FilterLayout,
        int8_t, DstLayout,
        int32_t, DstLayout,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        TbShape, WarpTile, InstShape, Epilogue,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        1, 4, 8, true,
        cutlass::arch::OpMultiplyAddSaturate>;

template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: SIMT op class on Sm61, threadblock tile 16x128x16,
// warp tile 16x128x16, BiasAddLinearCombinationReluClamp epilogue.
//
// silence warnings raised by cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// tensor layouts
using SrcLayout = cutlass::layout::TensorNCxHWx<4>;
using FilterLayout = cutlass::layout::TensorCxRSKx<4>;
using DstLayout = cutlass::layout::TensorNCxHWx<4>;

// tile shapes
using TbShape = cutlass::gemm::GemmShape<16, 128, 16>;
using WarpTile = cutlass::gemm::GemmShape<16, 128, 16>;
using InstShape = cutlass::gemm::GemmShape<1, 1, 4>;

// epilogue functor
using Epilogue = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
        int8_t, 4, int32_t, int32_t, float>;

// NOTE(review): the trailing integral/bool arguments are positional; their
// meaning is defined by cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, SrcLayout,
        int8_t, FilterLayout,
        int8_t, DstLayout,
        int32_t, DstLayout,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        TbShape, WarpTile, InstShape, Epilogue,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        1, 4, 8, true,
        cutlass::arch::OpMultiplyAddSaturate>;

template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_hswish.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: SIMT op class on Sm61, threadblock tile 16x64x8,
// warp tile 16x64x8, BiasAddLinearCombinationHSwishClamp epilogue.
//
// silence warnings raised by cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// tensor layouts
using SrcLayout = cutlass::layout::TensorNCxHWx<4>;
using FilterLayout = cutlass::layout::TensorCxRSKx<4>;
using DstLayout = cutlass::layout::TensorNCxHWx<4>;

// tile shapes
using TbShape = cutlass::gemm::GemmShape<16, 64, 8>;
using WarpTile = cutlass::gemm::GemmShape<16, 64, 8>;
using InstShape = cutlass::gemm::GemmShape<1, 1, 4>;

// epilogue functor
using Epilogue = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
        int8_t, 4, int32_t, int32_t, float>;

// NOTE(review): the trailing integral/bool arguments are positional; their
// meaning is defined by cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, SrcLayout,
        int8_t, FilterLayout,
        int8_t, DstLayout,
        int32_t, DstLayout,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        TbShape, WarpTile, InstShape, Epilogue,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2, 4, 4, true,
        cutlass::arch::OpMultiplyAddSaturate>;

template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: SIMT op class on Sm61, threadblock tile 16x64x8,
// warp tile 16x64x8, BiasAddLinearCombinationClamp epilogue.
//
// silence warnings raised by cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// tensor layouts
using SrcLayout = cutlass::layout::TensorNCxHWx<4>;
using FilterLayout = cutlass::layout::TensorCxRSKx<4>;
using DstLayout = cutlass::layout::TensorNCxHWx<4>;

// tile shapes
using TbShape = cutlass::gemm::GemmShape<16, 64, 8>;
using WarpTile = cutlass::gemm::GemmShape<16, 64, 8>;
using InstShape = cutlass::gemm::GemmShape<1, 1, 4>;

// epilogue functor
using Epilogue = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
        int8_t, 4, int32_t, int32_t, float>;

// NOTE(review): the trailing integral/bool arguments are positional; their
// meaning is defined by cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, SrcLayout,
        int8_t, FilterLayout,
        int8_t, DstLayout,
        int32_t, DstLayout,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        TbShape, WarpTile, InstShape, Epilogue,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2, 4, 4, true,
        cutlass::arch::OpMultiplyAddSaturate>;

template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_relu.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: SIMT op class on Sm61, threadblock tile 16x64x8,
// warp tile 16x64x8, BiasAddLinearCombinationReluClamp epilogue.
//
// silence warnings raised by cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// tensor layouts
using SrcLayout = cutlass::layout::TensorNCxHWx<4>;
using FilterLayout = cutlass::layout::TensorCxRSKx<4>;
using DstLayout = cutlass::layout::TensorNCxHWx<4>;

// tile shapes
using TbShape = cutlass::gemm::GemmShape<16, 64, 8>;
using WarpTile = cutlass::gemm::GemmShape<16, 64, 8>;
using InstShape = cutlass::gemm::GemmShape<1, 1, 4>;

// epilogue functor
using Epilogue = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
        int8_t, 4, int32_t, int32_t, float>;

// NOTE(review): the trailing integral/bool arguments are positional; their
// meaning is defined by cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, SrcLayout,
        int8_t, FilterLayout,
        int8_t, DstLayout,
        int32_t, DstLayout,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        TbShape, WarpTile, InstShape, Epilogue,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2, 4, 4, true,
        cutlass::arch::OpMultiplyAddSaturate>;

template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_hswish.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: SIMT op class on Sm61, threadblock tile 128x128x32,
// warp tile 64x32x32, BiasAddLinearCombinationHSwishClamp epilogue.
//
// silence warnings raised by cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// tensor layouts
using SrcLayout = cutlass::layout::TensorNCxHWx<4>;
using FilterLayout = cutlass::layout::TensorCxRSKx<4>;
using DstLayout = cutlass::layout::TensorNCxHWx<4>;

// tile shapes
using TbShape = cutlass::gemm::GemmShape<128, 128, 32>;
using WarpTile = cutlass::gemm::GemmShape<64, 32, 32>;
using InstShape = cutlass::gemm::GemmShape<1, 1, 4>;

// epilogue functor
using Epilogue = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
        int8_t, 4, int32_t, int32_t, float>;

// NOTE(review): the trailing integral/bool arguments are positional; their
// meaning is defined by cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, SrcLayout,
        int8_t, FilterLayout,
        int8_t, DstLayout,
        int32_t, DstLayout,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        TbShape, WarpTile, InstShape, Epilogue,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2, 4, 16, false,
        cutlass::arch::OpMultiplyAddSaturate>;

template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_id.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: SIMT op class on Sm61, threadblock tile 128x128x32,
// warp tile 64x32x32, BiasAddLinearCombinationClamp epilogue.
//
// silence warnings raised by cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// tensor layouts
using SrcLayout = cutlass::layout::TensorNCxHWx<4>;
using FilterLayout = cutlass::layout::TensorCxRSKx<4>;
using DstLayout = cutlass::layout::TensorNCxHWx<4>;

// tile shapes
using TbShape = cutlass::gemm::GemmShape<128, 128, 32>;
using WarpTile = cutlass::gemm::GemmShape<64, 32, 32>;
using InstShape = cutlass::gemm::GemmShape<1, 1, 4>;

// epilogue functor
using Epilogue = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
        int8_t, 4, int32_t, int32_t, float>;

// NOTE(review): the trailing integral/bool arguments are positional; their
// meaning is defined by cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, SrcLayout,
        int8_t, FilterLayout,
        int8_t, DstLayout,
        int32_t, DstLayout,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        TbShape, WarpTile, InstShape, Epilogue,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2, 4, 16, false,
        cutlass::arch::OpMultiplyAddSaturate>;

template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_relu.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: SIMT op class on Sm61, threadblock tile 128x128x32,
// warp tile 64x32x32, BiasAddLinearCombinationReluClamp epilogue.
//
// silence warnings raised by cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// tensor layouts
using SrcLayout = cutlass::layout::TensorNCxHWx<4>;
using FilterLayout = cutlass::layout::TensorCxRSKx<4>;
using DstLayout = cutlass::layout::TensorNCxHWx<4>;

// tile shapes
using TbShape = cutlass::gemm::GemmShape<128, 128, 32>;
using WarpTile = cutlass::gemm::GemmShape<64, 32, 32>;
using InstShape = cutlass::gemm::GemmShape<1, 1, 4>;

// epilogue functor
using Epilogue = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
        int8_t, 4, int32_t, int32_t, float>;

// NOTE(review): the trailing integral/bool arguments are positional; their
// meaning is defined by cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, SrcLayout,
        int8_t, FilterLayout,
        int8_t, DstLayout,
        int32_t, DstLayout,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        TbShape, WarpTile, InstShape, Epilogue,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2, 4, 16, false,
        cutlass::arch::OpMultiplyAddSaturate>;

template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_hswish.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicitly instantiates cutlass_convolution_wrapper for one kernel
// configuration: SIMT op class on Sm61, threadblock tile 128x32x32,
// warp tile 64x32x32, BiasAddLinearCombinationHSwishClamp epilogue.
//
// silence warnings raised by cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// tensor layouts
using SrcLayout = cutlass::layout::TensorNCxHWx<4>;
using FilterLayout = cutlass::layout::TensorCxRSKx<4>;
using DstLayout = cutlass::layout::TensorNCxHWx<4>;

// tile shapes
using TbShape = cutlass::gemm::GemmShape<128, 32, 32>;
using WarpTile = cutlass::gemm::GemmShape<64, 32, 32>;
using InstShape = cutlass::gemm::GemmShape<1, 1, 4>;

// epilogue functor
using Epilogue = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
        int8_t, 4, int32_t, int32_t, float>;

// NOTE(review): the trailing integral/bool arguments are positional; their
// meaning is defined by cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
        int8_t, SrcLayout,
        int8_t, FilterLayout,
        int8_t, DstLayout,
        int32_t, DstLayout,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        TbShape, WarpTile, InstShape, Epilogue,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        2, 4, 16, false,
        cutlass::arch::OpMultiplyAddSaturate>;

template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const typename Convolution::ElementSrc* d_src,
        const typename Convolution::ElementFilter* d_filter,
        const typename Convolution::ElementBias* d_bias,
        const typename Convolution::ElementDst* d_z,
        typename Convolution::ElementDst* d_dst,
        int* workspace,
        typename Convolution::ConvolutionParameter const& conv_param,
        typename Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_id.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration: threadblock tile 128x32x32, warp tile 64x32x32,
// BiasAddLinearCombinationClamp epilogue.
// Everything up to the closing #endif is skipped when MEGDNN_TEGRA_X1 is
// non-zero.
//
// Suppress warnings emitted while compiling the cutlass headers; the
// diagnostic state is restored by the pop at the end of this block.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts for source, filter and destination (template argument 4).
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes at threadblock, warp and instruction level.
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Per-thread epilogue functor (plain clamp variant, no extra activation in
// its name).
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution kernel type assembled from the aliases above, using the SIMT
// operator class with the Sm61 architecture tag.
// NOTE(review): the leading element/layout arguments and the trailing
// "2, 4, 16, false" are positional; presumably src/filter/dst/bias types plus
// accumulator, then stages, alignments and a boolean option -- confirm
// against cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation definition of the wrapper for this Convolution type.
// NOTE(review): the template itself is presumably defined in the .cuinl
// included above -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic settings saved by the push above.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_relu.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration: threadblock tile 128x32x32, warp tile 64x32x32,
// BiasAddLinearCombinationReluClamp epilogue.
// Everything up to the closing #endif is skipped when MEGDNN_TEGRA_X1 is
// non-zero.
//
// Suppress warnings emitted while compiling the cutlass headers; the
// diagnostic state is restored by the pop at the end of this block.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts for source, filter and destination (template argument 4).
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes at threadblock, warp and instruction level.
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Per-thread epilogue functor (ReLU variant).
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution kernel type assembled from the aliases above, using the SIMT
// operator class with the Sm61 architecture tag.
// NOTE(review): the leading element/layout arguments and the trailing
// "2, 4, 16, false" are positional; presumably src/filter/dst/bias types plus
// accumulator, then stages, alignments and a boolean option -- confirm
// against cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation definition of the wrapper for this Convolution type.
// NOTE(review): the template itself is presumably defined in the .cuinl
// included above -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic settings saved by the push above.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_hswish.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration: threadblock tile 128x64x32, warp tile 64x32x32,
// BiasAddLinearCombinationHSwishClamp epilogue.
// Everything up to the closing #endif is skipped when MEGDNN_TEGRA_X1 is
// non-zero.
//
// Suppress warnings emitted while compiling the cutlass headers; the
// diagnostic state is restored by the pop at the end of this block.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts for source, filter and destination (template argument 4).
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes at threadblock, warp and instruction level.
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Per-thread epilogue functor (HSwish variant).
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution kernel type assembled from the aliases above, using the SIMT
// operator class with the Sm61 architecture tag.
// NOTE(review): the leading element/layout arguments and the trailing
// "2, 4, 16, false" are positional; presumably src/filter/dst/bias types plus
// accumulator, then stages, alignments and a boolean option -- confirm
// against cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation definition of the wrapper for this Convolution type.
// NOTE(review): the template itself is presumably defined in the .cuinl
// included above -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic settings saved by the push above.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_id.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration: threadblock tile 128x64x32, warp tile 64x32x32,
// BiasAddLinearCombinationClamp epilogue.
// Everything up to the closing #endif is skipped when MEGDNN_TEGRA_X1 is
// non-zero.
//
// Suppress warnings emitted while compiling the cutlass headers; the
// diagnostic state is restored by the pop at the end of this block.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts for source, filter and destination (template argument 4).
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes at threadblock, warp and instruction level.
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Per-thread epilogue functor (plain clamp variant, no extra activation in
// its name).
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution kernel type assembled from the aliases above, using the SIMT
// operator class with the Sm61 architecture tag.
// NOTE(review): the leading element/layout arguments and the trailing
// "2, 4, 16, false" are positional; presumably src/filter/dst/bias types plus
// accumulator, then stages, alignments and a boolean option -- confirm
// against cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation definition of the wrapper for this Convolution type.
// NOTE(review): the template itself is presumably defined in the .cuinl
// included above -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic settings saved by the push above.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_relu.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration: threadblock tile 128x64x32, warp tile 64x32x32,
// BiasAddLinearCombinationReluClamp epilogue.
// Everything up to the closing #endif is skipped when MEGDNN_TEGRA_X1 is
// non-zero.
//
// Suppress warnings emitted while compiling the cutlass headers; the
// diagnostic state is restored by the pop at the end of this block.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts for source, filter and destination (template argument 4).
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes at threadblock, warp and instruction level.
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Per-thread epilogue functor (ReLU variant).
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution kernel type assembled from the aliases above, using the SIMT
// operator class with the Sm61 architecture tag.
// NOTE(review): the leading element/layout arguments and the trailing
// "2, 4, 16, false" are positional; presumably src/filter/dst/bias types plus
// accumulator, then stages, alignments and a boolean option -- confirm
// against cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation definition of the wrapper for this Convolution type.
// NOTE(review): the template itself is presumably defined in the .cuinl
// included above -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic settings saved by the push above.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration: threadblock tile 16x128x16, warp tile 16x128x16,
// BiasAddLinearCombinationHSwishClamp epilogue.
// Everything up to the closing #endif is skipped when MEGDNN_TEGRA_X1 is
// non-zero.
//
// Suppress warnings emitted while compiling the cutlass headers; the
// diagnostic state is restored by the pop at the end of this block.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts for source, filter and destination (template argument 4).
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes at threadblock, warp and instruction level; here the
// threadblock and warp tiles are identical.
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Per-thread epilogue functor (HSwish variant).
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution kernel type assembled from the aliases above, using the SIMT
// operator class with the Sm61 architecture tag.
// NOTE(review): the leading element/layout arguments and the trailing
// "1, 4, 8, false" are positional; presumably src/filter/dst/bias types plus
// accumulator, then stages, alignments and a boolean option -- confirm
// against cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
1, 4, 8, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation definition of the wrapper for this Convolution type.
// NOTE(review): the template itself is presumably defined in the .cuinl
// included above -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic settings saved by the push above.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration: threadblock tile 16x128x16, warp tile 16x128x16,
// BiasAddLinearCombinationClamp epilogue.
// Everything up to the closing #endif is skipped when MEGDNN_TEGRA_X1 is
// non-zero.
//
// Suppress warnings emitted while compiling the cutlass headers; the
// diagnostic state is restored by the pop at the end of this block.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts for source, filter and destination (template argument 4).
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes at threadblock, warp and instruction level; here the
// threadblock and warp tiles are identical.
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Per-thread epilogue functor (plain clamp variant, no extra activation in
// its name).
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution kernel type assembled from the aliases above, using the SIMT
// operator class with the Sm61 architecture tag.
// NOTE(review): the leading element/layout arguments and the trailing
// "1, 4, 8, false" are positional; presumably src/filter/dst/bias types plus
// accumulator, then stages, alignments and a boolean option -- confirm
// against cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
1, 4, 8, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation definition of the wrapper for this Convolution type.
// NOTE(review): the template itself is presumably defined in the .cuinl
// included above -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic settings saved by the push above.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration: threadblock tile 16x128x16, warp tile 16x128x16,
// BiasAddLinearCombinationReluClamp epilogue.
// Everything up to the closing #endif is skipped when MEGDNN_TEGRA_X1 is
// non-zero.
//
// Suppress warnings emitted while compiling the cutlass headers; the
// diagnostic state is restored by the pop at the end of this block.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts for source, filter and destination (template argument 4).
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes at threadblock, warp and instruction level; here the
// threadblock and warp tiles are identical.
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Per-thread epilogue functor (ReLU variant).
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution kernel type assembled from the aliases above, using the SIMT
// operator class with the Sm61 architecture tag.
// NOTE(review): the leading element/layout arguments and the trailing
// "1, 4, 8, false" are positional; presumably src/filter/dst/bias types plus
// accumulator, then stages, alignments and a boolean option -- confirm
// against cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
1, 4, 8, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation definition of the wrapper for this Convolution type.
// NOTE(review): the template itself is presumably defined in the .cuinl
// included above -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic settings saved by the push above.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_hswish.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration: threadblock tile 16x64x8, warp tile 16x64x8,
// BiasAddLinearCombinationHSwishClamp epilogue.
// Everything up to the closing #endif is skipped when MEGDNN_TEGRA_X1 is
// non-zero.
//
// Suppress warnings emitted while compiling the cutlass headers; the
// diagnostic state is restored by the pop at the end of this block.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts for source, filter and destination (template argument 4).
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes at threadblock, warp and instruction level; here the
// threadblock and warp tiles are identical.
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>;
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Per-thread epilogue functor (HSwish variant).
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution kernel type assembled from the aliases above, using the SIMT
// operator class with the Sm61 architecture tag.
// NOTE(review): the leading element/layout arguments and the trailing
// "2, 4, 4, false" are positional; presumably src/filter/dst/bias types plus
// accumulator, then stages, alignments and a boolean option -- confirm
// against cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 4, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation definition of the wrapper for this Convolution type.
// NOTE(review): the template itself is presumably defined in the .cuinl
// included above -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic settings saved by the push above.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_id.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration: threadblock tile 16x64x8, warp tile 16x64x8,
// BiasAddLinearCombinationClamp epilogue.
// Everything up to the closing #endif is skipped when MEGDNN_TEGRA_X1 is
// non-zero.
//
// Suppress warnings emitted while compiling the cutlass headers; the
// diagnostic state is restored by the pop at the end of this block.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts for source, filter and destination (template argument 4).
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes at threadblock, warp and instruction level; here the
// threadblock and warp tiles are identical.
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>;
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Per-thread epilogue functor (plain clamp variant, no extra activation in
// its name).
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution kernel type assembled from the aliases above, using the SIMT
// operator class with the Sm61 architecture tag.
// NOTE(review): the leading element/layout arguments and the trailing
// "2, 4, 4, false" are positional; presumably src/filter/dst/bias types plus
// accumulator, then stages, alignments and a boolean option -- confirm
// against cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 4, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation definition of the wrapper for this Convolution type.
// NOTE(review): the template itself is presumably defined in the .cuinl
// included above -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic settings saved by the push above.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_relu.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration: threadblock tile 16x64x8, warp tile 16x64x8,
// BiasAddLinearCombinationReluClamp epilogue.
// Everything up to the closing #endif is skipped when MEGDNN_TEGRA_X1 is
// non-zero.
//
// Suppress warnings emitted while compiling the cutlass headers; the
// diagnostic state is restored by the pop at the end of this block.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts for source, filter and destination (template argument 4).
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes at threadblock, warp and instruction level; here the
// threadblock and warp tiles are identical.
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>;
using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Per-thread epilogue functor (ReLU variant).
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution kernel type assembled from the aliases above, using the SIMT
// operator class with the Sm61 architecture tag.
// NOTE(review): the leading element/layout arguments and the trailing
// "2, 4, 4, false" are positional; presumably src/filter/dst/bias types plus
// accumulator, then stages, alignments and a boolean option -- confirm
// against cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 4, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation definition of the wrapper for this Convolution type.
// NOTE(review): the template itself is presumably defined in the .cuinl
// included above -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic settings saved by the push above.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_hswish.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration: threadblock tile 32x128x32, warp tile 32x64x32,
// BiasAddLinearCombinationHSwishClamp epilogue.
// Everything up to the closing #endif is skipped when MEGDNN_TEGRA_X1 is
// non-zero.
//
// Suppress warnings emitted while compiling the cutlass headers; the
// diagnostic state is restored by the pop at the end of this block.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts for source, filter and destination (template argument 4).
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes at threadblock, warp and instruction level.
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Per-thread epilogue functor (HSwish variant).
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution kernel type assembled from the aliases above, using the SIMT
// operator class with the Sm61 architecture tag.
// NOTE(review): the leading element/layout arguments and the trailing
// "2, 4, 16, false" are positional; presumably src/filter/dst/bias types plus
// accumulator, then stages, alignments and a boolean option -- confirm
// against cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation definition of the wrapper for this Convolution type.
// NOTE(review): the template itself is presumably defined in the .cuinl
// included above -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic settings saved by the push above.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_id.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration: threadblock tile 32x128x32, warp tile 32x64x32,
// BiasAddLinearCombinationClamp epilogue.
// Everything up to the closing #endif is skipped when MEGDNN_TEGRA_X1 is
// non-zero.
//
// Suppress warnings emitted while compiling the cutlass headers; the
// diagnostic state is restored by the pop at the end of this block.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts for source, filter and destination (template argument 4).
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes at threadblock, warp and instruction level.
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Per-thread epilogue functor (plain clamp variant, no extra activation in
// its name).
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution kernel type assembled from the aliases above, using the SIMT
// operator class with the Sm61 architecture tag.
// NOTE(review): the leading element/layout arguments and the trailing
// "2, 4, 16, false" are positional; presumably src/filter/dst/bias types plus
// accumulator, then stages, alignments and a boolean option -- confirm
// against cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation definition of the wrapper for this Convolution type.
// NOTE(review): the template itself is presumably defined in the .cuinl
// included above -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic settings saved by the push above.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_relu.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration: threadblock tile 32x128x32, warp tile 32x64x32,
// BiasAddLinearCombinationReluClamp epilogue.
// Everything up to the closing #endif is skipped when MEGDNN_TEGRA_X1 is
// non-zero.
//
// Suppress warnings emitted while compiling the cutlass headers; the
// diagnostic state is restored by the pop at the end of this block.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts for source, filter and destination (template argument 4).
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes at threadblock, warp and instruction level.
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Per-thread epilogue functor (ReLU variant).
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution kernel type assembled from the aliases above, using the SIMT
// operator class with the Sm61 architecture tag.
// NOTE(review): the leading element/layout arguments and the trailing
// "2, 4, 16, false" are positional; presumably src/filter/dst/bias types plus
// accumulator, then stages, alignments and a boolean option -- confirm
// against cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation definition of the wrapper for this Convolution type.
// NOTE(review): the template itself is presumably defined in the .cuinl
// included above -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic settings saved by the push above.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_hswish.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration: threadblock tile 32x32x32, warp tile 32x32x32,
// BiasAddLinearCombinationHSwishClamp epilogue.
// Everything up to the closing #endif is skipped when MEGDNN_TEGRA_X1 is
// non-zero.
//
// Suppress warnings emitted while compiling the cutlass headers; the
// diagnostic state is restored by the pop at the end of this block.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts for source, filter and destination (template argument 4).
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes at threadblock, warp and instruction level; here the
// threadblock and warp tiles are identical.
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>;
using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Per-thread epilogue functor (HSwish variant).
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution kernel type assembled from the aliases above, using the SIMT
// operator class with the Sm61 architecture tag.
// NOTE(review): the leading element/layout arguments and the trailing
// "2, 4, 16, false" are positional; presumably src/filter/dst/bias types plus
// accumulator, then stages, alignments and a boolean option -- confirm
// against cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation definition of the wrapper for this Convolution type.
// NOTE(review): the template itself is presumably defined in the .cuinl
// included above -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic settings saved by the push above.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_id.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration: threadblock tile 32x32x32, warp tile 32x32x32,
// BiasAddLinearCombinationClamp epilogue.
// Everything up to the closing #endif is skipped when MEGDNN_TEGRA_X1 is
// non-zero.
//
// Suppress warnings emitted while compiling the cutlass headers; the
// diagnostic state is restored by the pop at the end of this block.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts for source, filter and destination (template argument 4).
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes at threadblock, warp and instruction level; here the
// threadblock and warp tiles are identical.
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>;
using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Per-thread epilogue functor (plain clamp variant, no extra activation in
// its name).
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution kernel type assembled from the aliases above, using the SIMT
// operator class with the Sm61 architecture tag.
// NOTE(review): the leading element/layout arguments and the trailing
// "2, 4, 16, false" are positional; presumably src/filter/dst/bias types plus
// accumulator, then stages, alignments and a boolean option -- confirm
// against cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation definition of the wrapper for this Convolution type.
// NOTE(review): the template itself is presumably defined in the .cuinl
// included above -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic settings saved by the push above.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_relu.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration: threadblock tile 32x32x32, warp tile 32x32x32,
// BiasAddLinearCombinationReluClamp epilogue.
// Everything up to the closing #endif is skipped when MEGDNN_TEGRA_X1 is
// non-zero.
//
// Suppress warnings emitted while compiling the cutlass headers; the
// diagnostic state is restored by the pop at the end of this block.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts for source, filter and destination (template argument 4).
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes at threadblock, warp and instruction level; here the
// threadblock and warp tiles are identical.
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>;
using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Per-thread epilogue functor (ReLU variant).
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution kernel type assembled from the aliases above, using the SIMT
// operator class with the Sm61 architecture tag.
// NOTE(review): the leading element/layout arguments and the trailing
// "2, 4, 16, false" are positional; presumably src/filter/dst/bias types plus
// accumulator, then stages, alignments and a boolean option -- confirm
// against cutlass::conv::device::Convolution.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation definition of the wrapper for this Convolution type.
// NOTE(review): the template itself is presumably defined in the .cuinl
// included above -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic settings saved by the push above.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_hswish.cu View File

@@ -1,36 +0,0 @@
// Compiled only when MEGDNN_TEGRA_X1 is undefined or evaluates to 0.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
// The push/pop pair scopes the two suppressions below to the include and the
// instantiation in this file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts: src and dst use TensorNCxHWx<4>, the filter uses TensorCxRSKx<4>.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes for this variant: threadblock 32x64x32, warp 32x64x32,
// instruction 1x1x4.
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue functor of the "hswish" variant.
// NOTE(review): judging by its name it adds bias, applies a linear combination
// and h-swish, then clamps; the template-argument meanings are not visible here.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution operator assembled from the aliases above: int8_t paired with the
// src/filter/dst layouts, int32_t for the two remaining element types
// (presumably bias and accumulator -- TODO confirm), SIMT op class, Sm61 arch tag.
// NOTE(review): `2, 4, 16, false` are presumably stage count, src/filter
// alignment and a load-from-constant-memory flag -- confirm against the
// Convolution template declaration.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation: emits cutlass_convolution_wrapper<Convolution> in this
// translation unit. NOTE(review): the template definition presumably comes from
// the .cuinl included above.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic state saved by the push at the top of the file.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_id.cu View File

@@ -1,36 +0,0 @@
// Compiled only when MEGDNN_TEGRA_X1 is undefined or evaluates to 0.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
// The push/pop pair scopes the two suppressions below to the include and the
// instantiation in this file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts: src and dst use TensorNCxHWx<4>, the filter uses TensorCxRSKx<4>.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes for this variant: threadblock 32x64x32, warp 32x64x32,
// instruction 1x1x4.
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue functor of the "id" variant.
// NOTE(review): judging by its name it adds bias, applies a linear combination
// and clamps, with no activation; the template-argument meanings are not
// visible here.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution operator assembled from the aliases above: int8_t paired with the
// src/filter/dst layouts, int32_t for the two remaining element types
// (presumably bias and accumulator -- TODO confirm), SIMT op class, Sm61 arch tag.
// NOTE(review): `2, 4, 16, false` are presumably stage count, src/filter
// alignment and a load-from-constant-memory flag -- confirm against the
// Convolution template declaration.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation: emits cutlass_convolution_wrapper<Convolution> in this
// translation unit. NOTE(review): the template definition presumably comes from
// the .cuinl included above.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic state saved by the push at the top of the file.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_relu.cu View File

@@ -1,36 +0,0 @@
// Compiled only when MEGDNN_TEGRA_X1 is undefined or evaluates to 0.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
// The push/pop pair scopes the two suppressions below to the include and the
// instantiation in this file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts: src and dst use TensorNCxHWx<4>, the filter uses TensorCxRSKx<4>.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes for this variant: threadblock 32x64x32, warp 32x64x32,
// instruction 1x1x4.
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue functor of the "relu" variant.
// NOTE(review): judging by its name it adds bias, applies a linear combination
// and ReLU, then clamps; the template-argument meanings are not visible here.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution operator assembled from the aliases above: int8_t paired with the
// src/filter/dst layouts, int32_t for the two remaining element types
// (presumably bias and accumulator -- TODO confirm), SIMT op class, Sm61 arch tag.
// NOTE(review): `2, 4, 16, false` are presumably stage count, src/filter
// alignment and a load-from-constant-memory flag -- confirm against the
// Convolution template declaration.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation: emits cutlass_convolution_wrapper<Convolution> in this
// translation unit. NOTE(review): the template definition presumably comes from
// the .cuinl included above.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic state saved by the push at the top of the file.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_hswish.cu View File

@@ -1,36 +0,0 @@
// Compiled only when MEGDNN_TEGRA_X1 is undefined or evaluates to 0.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
// The push/pop pair scopes the two suppressions below to the include and the
// instantiation in this file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts: src and dst use TensorNCxHWx<4>, the filter uses TensorCxRSKx<4>.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes for this variant: threadblock 64x128x32, warp 64x32x32,
// instruction 1x1x4.
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue functor of the "hswish" variant.
// NOTE(review): judging by its name it adds bias, applies a linear combination
// and h-swish, then clamps; the template-argument meanings are not visible here.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution operator assembled from the aliases above: int8_t paired with the
// src/filter/dst layouts, int32_t for the two remaining element types
// (presumably bias and accumulator -- TODO confirm), SIMT op class, Sm61 arch tag.
// NOTE(review): `2, 4, 16, false` are presumably stage count, src/filter
// alignment and a load-from-constant-memory flag -- confirm against the
// Convolution template declaration.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation: emits cutlass_convolution_wrapper<Convolution> in this
// translation unit. NOTE(review): the template definition presumably comes from
// the .cuinl included above.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic state saved by the push at the top of the file.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_id.cu View File

@@ -1,36 +0,0 @@
// Compiled only when MEGDNN_TEGRA_X1 is undefined or evaluates to 0.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
// The push/pop pair scopes the two suppressions below to the include and the
// instantiation in this file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts: src and dst use TensorNCxHWx<4>, the filter uses TensorCxRSKx<4>.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes for this variant: threadblock 64x128x32, warp 64x32x32,
// instruction 1x1x4.
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue functor of the "id" variant.
// NOTE(review): judging by its name it adds bias, applies a linear combination
// and clamps, with no activation; the template-argument meanings are not
// visible here.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution operator assembled from the aliases above: int8_t paired with the
// src/filter/dst layouts, int32_t for the two remaining element types
// (presumably bias and accumulator -- TODO confirm), SIMT op class, Sm61 arch tag.
// NOTE(review): `2, 4, 16, false` are presumably stage count, src/filter
// alignment and a load-from-constant-memory flag -- confirm against the
// Convolution template declaration.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation: emits cutlass_convolution_wrapper<Convolution> in this
// translation unit. NOTE(review): the template definition presumably comes from
// the .cuinl included above.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic state saved by the push at the top of the file.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_relu.cu View File

@@ -1,36 +0,0 @@
// Compiled only when MEGDNN_TEGRA_X1 is undefined or evaluates to 0.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
// The push/pop pair scopes the two suppressions below to the include and the
// instantiation in this file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts: src and dst use TensorNCxHWx<4>, the filter uses TensorCxRSKx<4>.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes for this variant: threadblock 64x128x32, warp 64x32x32,
// instruction 1x1x4.
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue functor of the "relu" variant.
// NOTE(review): judging by its name it adds bias, applies a linear combination
// and ReLU, then clamps; the template-argument meanings are not visible here.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution operator assembled from the aliases above: int8_t paired with the
// src/filter/dst layouts, int32_t for the two remaining element types
// (presumably bias and accumulator -- TODO confirm), SIMT op class, Sm61 arch tag.
// NOTE(review): `2, 4, 16, false` are presumably stage count, src/filter
// alignment and a load-from-constant-memory flag -- confirm against the
// Convolution template declaration.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation: emits cutlass_convolution_wrapper<Convolution> in this
// translation unit. NOTE(review): the template definition presumably comes from
// the .cuinl included above.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic state saved by the push at the top of the file.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_hswish.cu View File

@@ -1,36 +0,0 @@
// Compiled only when MEGDNN_TEGRA_X1 is undefined or evaluates to 0.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
// The push/pop pair scopes the two suppressions below to the include and the
// instantiation in this file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts: src and dst use TensorNCxHWx<4>, the filter uses TensorCxRSKx<4>.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes for this variant: threadblock 64x32x32, warp 64x32x32,
// instruction 1x1x4.
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue functor of the "hswish" variant.
// NOTE(review): judging by its name it adds bias, applies a linear combination
// and h-swish, then clamps; the template-argument meanings are not visible here.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution operator assembled from the aliases above: int8_t paired with the
// src/filter/dst layouts, int32_t for the two remaining element types
// (presumably bias and accumulator -- TODO confirm), SIMT op class, Sm61 arch tag.
// NOTE(review): `2, 4, 16, false` are presumably stage count, src/filter
// alignment and a load-from-constant-memory flag -- confirm against the
// Convolution template declaration.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation: emits cutlass_convolution_wrapper<Convolution> in this
// translation unit. NOTE(review): the template definition presumably comes from
// the .cuinl included above.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic state saved by the push at the top of the file.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_id.cu View File

@@ -1,36 +0,0 @@
// Compiled only when MEGDNN_TEGRA_X1 is undefined or evaluates to 0.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
// The push/pop pair scopes the two suppressions below to the include and the
// instantiation in this file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts: src and dst use TensorNCxHWx<4>, the filter uses TensorCxRSKx<4>.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes for this variant: threadblock 64x32x32, warp 64x32x32,
// instruction 1x1x4.
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue functor of the "id" variant.
// NOTE(review): judging by its name it adds bias, applies a linear combination
// and clamps, with no activation; the template-argument meanings are not
// visible here.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution operator assembled from the aliases above: int8_t paired with the
// src/filter/dst layouts, int32_t for the two remaining element types
// (presumably bias and accumulator -- TODO confirm), SIMT op class, Sm61 arch tag.
// NOTE(review): `2, 4, 16, false` are presumably stage count, src/filter
// alignment and a load-from-constant-memory flag -- confirm against the
// Convolution template declaration.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation: emits cutlass_convolution_wrapper<Convolution> in this
// translation unit. NOTE(review): the template definition presumably comes from
// the .cuinl included above.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic state saved by the push at the top of the file.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_relu.cu View File

@@ -1,36 +0,0 @@
// Compiled only when MEGDNN_TEGRA_X1 is undefined or evaluates to 0.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
// The push/pop pair scopes the two suppressions below to the include and the
// instantiation in this file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts: src and dst use TensorNCxHWx<4>, the filter uses TensorCxRSKx<4>.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes for this variant: threadblock 64x32x32, warp 64x32x32,
// instruction 1x1x4.
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue functor of the "relu" variant.
// NOTE(review): judging by its name it adds bias, applies a linear combination
// and ReLU, then clamps; the template-argument meanings are not visible here.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution operator assembled from the aliases above: int8_t paired with the
// src/filter/dst layouts, int32_t for the two remaining element types
// (presumably bias and accumulator -- TODO confirm), SIMT op class, Sm61 arch tag.
// NOTE(review): `2, 4, 16, false` are presumably stage count, src/filter
// alignment and a load-from-constant-memory flag -- confirm against the
// Convolution template declaration.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation: emits cutlass_convolution_wrapper<Convolution> in this
// translation unit. NOTE(review): the template definition presumably comes from
// the .cuinl included above.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic state saved by the push at the top of the file.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_hswish.cu View File

@@ -1,36 +0,0 @@
// Compiled only when MEGDNN_TEGRA_X1 is undefined or evaluates to 0.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
// The push/pop pair scopes the two suppressions below to the include and the
// instantiation in this file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts: src and dst use TensorNCxHWx<4>, the filter uses TensorCxRSKx<4>.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes for this variant: threadblock 64x64x32, warp 64x32x32,
// instruction 1x1x4.
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue functor of the "hswish" variant.
// NOTE(review): judging by its name it adds bias, applies a linear combination
// and h-swish, then clamps; the template-argument meanings are not visible here.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution operator assembled from the aliases above: int8_t paired with the
// src/filter/dst layouts, int32_t for the two remaining element types
// (presumably bias and accumulator -- TODO confirm), SIMT op class, Sm61 arch tag.
// NOTE(review): `2, 4, 16, false` are presumably stage count, src/filter
// alignment and a load-from-constant-memory flag -- confirm against the
// Convolution template declaration.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation: emits cutlass_convolution_wrapper<Convolution> in this
// translation unit. NOTE(review): the template definition presumably comes from
// the .cuinl included above.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic state saved by the push at the top of the file.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_id.cu View File

@@ -1,36 +0,0 @@
// Compiled only when MEGDNN_TEGRA_X1 is undefined or evaluates to 0.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
// The push/pop pair scopes the two suppressions below to the include and the
// instantiation in this file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts: src and dst use TensorNCxHWx<4>, the filter uses TensorCxRSKx<4>.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes for this variant: threadblock 64x64x32, warp 64x32x32,
// instruction 1x1x4.
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue functor of the "id" variant.
// NOTE(review): judging by its name it adds bias, applies a linear combination
// and clamps, with no activation; the template-argument meanings are not
// visible here.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution operator assembled from the aliases above: int8_t paired with the
// src/filter/dst layouts, int32_t for the two remaining element types
// (presumably bias and accumulator -- TODO confirm), SIMT op class, Sm61 arch tag.
// NOTE(review): `2, 4, 16, false` are presumably stage count, src/filter
// alignment and a load-from-constant-memory flag -- confirm against the
// Convolution template declaration.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation: emits cutlass_convolution_wrapper<Convolution> in this
// translation unit. NOTE(review): the template definition presumably comes from
// the .cuinl included above.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic state saved by the push at the top of the file.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_relu.cu View File

@@ -1,36 +0,0 @@
// Compiled only when MEGDNN_TEGRA_X1 is undefined or evaluates to 0.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
// The push/pop pair scopes the two suppressions below to the include and the
// instantiation in this file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts: src and dst use TensorNCxHWx<4>, the filter uses TensorCxRSKx<4>.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes for this variant: threadblock 64x64x32, warp 64x32x32,
// instruction 1x1x4.
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue functor of the "relu" variant.
// NOTE(review): judging by its name it adds bias, applies a linear combination
// and ReLU, then clamps; the template-argument meanings are not visible here.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution operator assembled from the aliases above: int8_t paired with the
// src/filter/dst layouts, int32_t for the two remaining element types
// (presumably bias and accumulator -- TODO confirm), SIMT op class, Sm61 arch tag.
// NOTE(review): `2, 4, 16, false` are presumably stage count, src/filter
// alignment and a load-from-constant-memory flag -- confirm against the
// Convolution template declaration.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation: emits cutlass_convolution_wrapper<Convolution> in this
// translation unit. NOTE(review): the template definition presumably comes from
// the .cuinl included above.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic state saved by the push at the top of the file.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_hswish.cu View File

@@ -1,36 +0,0 @@
// Compiled only when MEGDNN_TEGRA_X1 is undefined or evaluates to 0.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
// The push/pop pair scopes the two suppressions below to the include and the
// instantiation in this file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts: src and dst use TensorNCxHWx<4>, the filter uses TensorCxRSKx<4>.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes for this variant: threadblock 32x128x32, warp 32x64x32,
// instruction 1x1x4.
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue functor of the "hswish" variant.
// NOTE(review): judging by its name it adds bias, applies a linear combination
// and h-swish, then clamps; the template-argument meanings are not visible here.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution operator assembled from the aliases above: int8_t paired with the
// src/filter/dst layouts, int32_t for the two remaining element types
// (presumably bias and accumulator -- TODO confirm), SIMT op class, Sm61 arch tag.
// NOTE(review): `2, 4, 16, true` are presumably stage count, src/filter
// alignment and a load-from-constant-memory flag -- confirm against the
// Convolution template declaration.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation: emits cutlass_convolution_wrapper<Convolution> in this
// translation unit. NOTE(review): the template definition presumably comes from
// the .cuinl included above.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic state saved by the push at the top of the file.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu View File

@@ -1,36 +0,0 @@
// Compiled only when MEGDNN_TEGRA_X1 is undefined or evaluates to 0.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
// The push/pop pair scopes the two suppressions below to the include and the
// instantiation in this file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts: src and dst use TensorNCxHWx<4>, the filter uses TensorCxRSKx<4>.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes for this variant: threadblock 32x128x32, warp 32x64x32,
// instruction 1x1x4.
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue functor of the "id" variant.
// NOTE(review): judging by its name it adds bias, applies a linear combination
// and clamps, with no activation; the template-argument meanings are not
// visible here.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution operator assembled from the aliases above: int8_t paired with the
// src/filter/dst layouts, int32_t for the two remaining element types
// (presumably bias and accumulator -- TODO confirm), SIMT op class, Sm61 arch tag.
// NOTE(review): `2, 4, 16, true` are presumably stage count, src/filter
// alignment and a load-from-constant-memory flag -- confirm against the
// Convolution template declaration.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation: emits cutlass_convolution_wrapper<Convolution> in this
// translation unit. NOTE(review): the template definition presumably comes from
// the .cuinl included above.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic state saved by the push at the top of the file.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_relu.cu View File

@@ -1,36 +0,0 @@
// Compiled only when MEGDNN_TEGRA_X1 is undefined or evaluates to 0.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
// The push/pop pair scopes the two suppressions below to the include and the
// instantiation in this file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts: src and dst use TensorNCxHWx<4>, the filter uses TensorCxRSKx<4>.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes for this variant: threadblock 32x128x32, warp 32x64x32,
// instruction 1x1x4.
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue functor of the "relu" variant.
// NOTE(review): judging by its name it adds bias, applies a linear combination
// and ReLU, then clamps; the template-argument meanings are not visible here.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution operator assembled from the aliases above: int8_t paired with the
// src/filter/dst layouts, int32_t for the two remaining element types
// (presumably bias and accumulator -- TODO confirm), SIMT op class, Sm61 arch tag.
// NOTE(review): `2, 4, 16, true` are presumably stage count, src/filter
// alignment and a load-from-constant-memory flag -- confirm against the
// Convolution template declaration.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation: emits cutlass_convolution_wrapper<Convolution> in this
// translation unit. NOTE(review): the template definition presumably comes from
// the .cuinl included above.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic state saved by the push at the top of the file.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_hswish.cu View File

@@ -1,36 +0,0 @@
// Compiled only when MEGDNN_TEGRA_X1 is undefined or evaluates to 0.
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
// ignore warning of cutlass
// The push/pop pair scopes the two suppressions below to the include and the
// instantiation in this file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts: src and dst use TensorNCxHWx<4>, the filter uses TensorCxRSKx<4>.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
// GEMM tile shapes for this variant: threadblock 32x32x32, warp 32x32x32,
// instruction 1x1x4.
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>;
using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue functor of the "hswish" variant.
// NOTE(review): judging by its name it adds bias, applies a linear combination
// and h-swish, then clamps; the template-argument meanings are not visible here.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
// Convolution operator assembled from the aliases above: int8_t paired with the
// src/filter/dst layouts, int32_t for the two remaining element types
// (presumably bias and accumulator -- TODO confirm), SIMT op class, Sm61 arch tag.
// NOTE(review): `2, 4, 16, true` are presumably stage count, src/filter
// alignment and a load-from-constant-memory flag -- confirm against the
// Convolution template declaration.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation: emits cutlass_convolution_wrapper<Convolution> in this
// translation unit. NOTE(review): the template definition presumably comes from
// the .cuinl included above.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
// Restore the diagnostic state saved by the push at the top of the file.
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_id.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration. Everything in this file is compiled out when
// MEGDNN_TEGRA_X1 is non-zero.

// The cutlass headers pulled in below trip these GCC warnings; silence them
// until the matching "pop" at the end of the file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tile shapes at thread-block, warp and instruction granularity.
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>;
using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;

// Tensor layouts of the source, filter and destination operands.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;

// Epilogue functor. Judging by its name: bias add + linear combination
// followed by a clamp, with no additional activation.
using EpilogueOp =
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                int8_t, 4, int32_t, int32_t, float>;

// Complete kernel configuration (SIMT op class, Sm61 arch tag).
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc,
        int8_t, LayoutFilter,
        int8_t, LayoutDst,
        int32_t, LayoutDst,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape,
        EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        // Positional tuning arguments; their meaning is defined by the
        // Convolution template declaration, which is not visible here.
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Emit the wrapper's code for this configuration into this object file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        Convolution::ElementSrc const* d_src,
        Convolution::ElementFilter const* d_filter,
        Convolution::ElementBias const* d_bias,
        Convolution::ElementDst const* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        Convolution::ConvolutionParameter const& conv_param,
        Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);

#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_relu.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration. Everything in this file is compiled out when
// MEGDNN_TEGRA_X1 is non-zero.

// The cutlass headers pulled in below trip these GCC warnings; silence them
// until the matching "pop" at the end of the file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tile shapes at thread-block, warp and instruction granularity.
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>;
using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;

// Tensor layouts of the source, filter and destination operands.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;

// Epilogue functor. Judging by its name: bias add + linear combination,
// followed by a ReLU activation and a clamp.
using EpilogueOp =
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                int8_t, 4, int32_t, int32_t, float>;

// Complete kernel configuration (SIMT op class, Sm61 arch tag).
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc,
        int8_t, LayoutFilter,
        int8_t, LayoutDst,
        int32_t, LayoutDst,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape,
        EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        // Positional tuning arguments; their meaning is defined by the
        // Convolution template declaration, which is not visible here.
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Emit the wrapper's code for this configuration into this object file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        Convolution::ElementSrc const* d_src,
        Convolution::ElementFilter const* d_filter,
        Convolution::ElementBias const* d_bias,
        Convolution::ElementDst const* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        Convolution::ConvolutionParameter const& conv_param,
        Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);

#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_hswish.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration. Everything in this file is compiled out when
// MEGDNN_TEGRA_X1 is non-zero.

// The cutlass headers pulled in below trip these GCC warnings; silence them
// until the matching "pop" at the end of the file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tile shapes at thread-block, warp and instruction granularity.
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;

// Tensor layouts of the source, filter and destination operands.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;

// Epilogue functor. Judging by its name: bias add + linear combination,
// followed by an h-swish activation and a clamp.
using EpilogueOp =
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                int8_t, 4, int32_t, int32_t, float>;

// Complete kernel configuration (SIMT op class, Sm61 arch tag).
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc,
        int8_t, LayoutFilter,
        int8_t, LayoutDst,
        int32_t, LayoutDst,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape,
        EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        // Positional tuning arguments; their meaning is defined by the
        // Convolution template declaration, which is not visible here.
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Emit the wrapper's code for this configuration into this object file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        Convolution::ElementSrc const* d_src,
        Convolution::ElementFilter const* d_filter,
        Convolution::ElementBias const* d_bias,
        Convolution::ElementDst const* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        Convolution::ConvolutionParameter const& conv_param,
        Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);

#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_id.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration. Everything in this file is compiled out when
// MEGDNN_TEGRA_X1 is non-zero.

// The cutlass headers pulled in below trip these GCC warnings; silence them
// until the matching "pop" at the end of the file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tile shapes at thread-block, warp and instruction granularity.
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;

// Tensor layouts of the source, filter and destination operands.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;

// Epilogue functor. Judging by its name: bias add + linear combination
// followed by a clamp, with no additional activation.
using EpilogueOp =
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                int8_t, 4, int32_t, int32_t, float>;

// Complete kernel configuration (SIMT op class, Sm61 arch tag).
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc,
        int8_t, LayoutFilter,
        int8_t, LayoutDst,
        int32_t, LayoutDst,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape,
        EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        // Positional tuning arguments; their meaning is defined by the
        // Convolution template declaration, which is not visible here.
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Emit the wrapper's code for this configuration into this object file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        Convolution::ElementSrc const* d_src,
        Convolution::ElementFilter const* d_filter,
        Convolution::ElementBias const* d_bias,
        Convolution::ElementDst const* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        Convolution::ConvolutionParameter const& conv_param,
        Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);

#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_relu.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration. Everything in this file is compiled out when
// MEGDNN_TEGRA_X1 is non-zero.

// The cutlass headers pulled in below trip these GCC warnings; silence them
// until the matching "pop" at the end of the file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tile shapes at thread-block, warp and instruction granularity.
using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>;
using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;

// Tensor layouts of the source, filter and destination operands.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;

// Epilogue functor. Judging by its name: bias add + linear combination,
// followed by a ReLU activation and a clamp.
using EpilogueOp =
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                int8_t, 4, int32_t, int32_t, float>;

// Complete kernel configuration (SIMT op class, Sm61 arch tag).
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc,
        int8_t, LayoutFilter,
        int8_t, LayoutDst,
        int32_t, LayoutDst,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape,
        EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        // Positional tuning arguments; their meaning is defined by the
        // Convolution template declaration, which is not visible here.
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Emit the wrapper's code for this configuration into this object file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        Convolution::ElementSrc const* d_src,
        Convolution::ElementFilter const* d_filter,
        Convolution::ElementBias const* d_bias,
        Convolution::ElementDst const* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        Convolution::ConvolutionParameter const& conv_param,
        Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);

#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_hswish.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration. Everything in this file is compiled out when
// MEGDNN_TEGRA_X1 is non-zero.

// The cutlass headers pulled in below trip these GCC warnings; silence them
// until the matching "pop" at the end of the file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tile shapes at thread-block, warp and instruction granularity.
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;

// Tensor layouts of the source, filter and destination operands.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;

// Epilogue functor. Judging by its name: bias add + linear combination,
// followed by an h-swish activation and a clamp.
using EpilogueOp =
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                int8_t, 4, int32_t, int32_t, float>;

// Complete kernel configuration (SIMT op class, Sm61 arch tag).
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc,
        int8_t, LayoutFilter,
        int8_t, LayoutDst,
        int32_t, LayoutDst,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape,
        EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        // Positional tuning arguments; their meaning is defined by the
        // Convolution template declaration, which is not visible here.
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Emit the wrapper's code for this configuration into this object file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        Convolution::ElementSrc const* d_src,
        Convolution::ElementFilter const* d_filter,
        Convolution::ElementBias const* d_bias,
        Convolution::ElementDst const* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        Convolution::ConvolutionParameter const& conv_param,
        Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);

#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration. Everything in this file is compiled out when
// MEGDNN_TEGRA_X1 is non-zero.

// The cutlass headers pulled in below trip these GCC warnings; silence them
// until the matching "pop" at the end of the file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tile shapes at thread-block, warp and instruction granularity.
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;

// Tensor layouts of the source, filter and destination operands.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;

// Epilogue functor. Judging by its name: bias add + linear combination
// followed by a clamp, with no additional activation.
using EpilogueOp =
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                int8_t, 4, int32_t, int32_t, float>;

// Complete kernel configuration (SIMT op class, Sm61 arch tag).
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc,
        int8_t, LayoutFilter,
        int8_t, LayoutDst,
        int32_t, LayoutDst,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape,
        EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        // Positional tuning arguments; their meaning is defined by the
        // Convolution template declaration, which is not visible here.
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Emit the wrapper's code for this configuration into this object file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        Convolution::ElementSrc const* d_src,
        Convolution::ElementFilter const* d_filter,
        Convolution::ElementBias const* d_bias,
        Convolution::ElementDst const* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        Convolution::ConvolutionParameter const& conv_param,
        Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);

#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_relu.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration. Everything in this file is compiled out when
// MEGDNN_TEGRA_X1 is non-zero.

// The cutlass headers pulled in below trip these GCC warnings; silence them
// until the matching "pop" at the end of the file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tile shapes at thread-block, warp and instruction granularity.
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;

// Tensor layouts of the source, filter and destination operands.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;

// Epilogue functor. Judging by its name: bias add + linear combination,
// followed by a ReLU activation and a clamp.
using EpilogueOp =
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                int8_t, 4, int32_t, int32_t, float>;

// Complete kernel configuration (SIMT op class, Sm61 arch tag).
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc,
        int8_t, LayoutFilter,
        int8_t, LayoutDst,
        int32_t, LayoutDst,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape,
        EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        // Positional tuning arguments; their meaning is defined by the
        // Convolution template declaration, which is not visible here.
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Emit the wrapper's code for this configuration into this object file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        Convolution::ElementSrc const* d_src,
        Convolution::ElementFilter const* d_filter,
        Convolution::ElementBias const* d_bias,
        Convolution::ElementDst const* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        Convolution::ConvolutionParameter const& conv_param,
        Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);

#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_hswish.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration. Everything in this file is compiled out when
// MEGDNN_TEGRA_X1 is non-zero.

// The cutlass headers pulled in below trip these GCC warnings; silence them
// until the matching "pop" at the end of the file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tile shapes at thread-block, warp and instruction granularity.
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;

// Tensor layouts of the source, filter and destination operands.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;

// Epilogue functor. Judging by its name: bias add + linear combination,
// followed by an h-swish activation and a clamp.
using EpilogueOp =
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                int8_t, 4, int32_t, int32_t, float>;

// Complete kernel configuration (SIMT op class, Sm61 arch tag).
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc,
        int8_t, LayoutFilter,
        int8_t, LayoutDst,
        int32_t, LayoutDst,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape,
        EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        // Positional tuning arguments; their meaning is defined by the
        // Convolution template declaration, which is not visible here.
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Emit the wrapper's code for this configuration into this object file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        Convolution::ElementSrc const* d_src,
        Convolution::ElementFilter const* d_filter,
        Convolution::ElementBias const* d_bias,
        Convolution::ElementDst const* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        Convolution::ConvolutionParameter const& conv_param,
        Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);

#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_id.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration. Everything in this file is compiled out when
// MEGDNN_TEGRA_X1 is non-zero.

// The cutlass headers pulled in below trip these GCC warnings; silence them
// until the matching "pop" at the end of the file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tile shapes at thread-block, warp and instruction granularity.
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;

// Tensor layouts of the source, filter and destination operands.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;

// Epilogue functor. Judging by its name: bias add + linear combination
// followed by a clamp, with no additional activation.
using EpilogueOp =
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                int8_t, 4, int32_t, int32_t, float>;

// Complete kernel configuration (SIMT op class, Sm61 arch tag).
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc,
        int8_t, LayoutFilter,
        int8_t, LayoutDst,
        int32_t, LayoutDst,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape,
        EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        // Positional tuning arguments; their meaning is defined by the
        // Convolution template declaration, which is not visible here.
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Emit the wrapper's code for this configuration into this object file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        Convolution::ElementSrc const* d_src,
        Convolution::ElementFilter const* d_filter,
        Convolution::ElementBias const* d_bias,
        Convolution::ElementDst const* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        Convolution::ConvolutionParameter const& conv_param,
        Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);

#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_relu.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration. Everything in this file is compiled out when
// MEGDNN_TEGRA_X1 is non-zero.

// The cutlass headers pulled in below trip these GCC warnings; silence them
// until the matching "pop" at the end of the file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tile shapes at thread-block, warp and instruction granularity.
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;

// Tensor layouts of the source, filter and destination operands.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;

// Epilogue functor. Judging by its name: bias add + linear combination,
// followed by a ReLU activation and a clamp.
using EpilogueOp =
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                int8_t, 4, int32_t, int32_t, float>;

// Complete kernel configuration (SIMT op class, Sm61 arch tag).
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc,
        int8_t, LayoutFilter,
        int8_t, LayoutDst,
        int32_t, LayoutDst,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape,
        EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        // Positional tuning arguments; their meaning is defined by the
        // Convolution template declaration, which is not visible here.
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Emit the wrapper's code for this configuration into this object file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        Convolution::ElementSrc const* d_src,
        Convolution::ElementFilter const* d_filter,
        Convolution::ElementBias const* d_bias,
        Convolution::ElementDst const* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        Convolution::ConvolutionParameter const& conv_param,
        Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);

#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_hswish.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration. Everything in this file is compiled out when
// MEGDNN_TEGRA_X1 is non-zero.

// The cutlass headers pulled in below trip these GCC warnings; silence them
// until the matching "pop" at the end of the file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tile shapes at thread-block, warp and instruction granularity.
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;

// Tensor layouts of the source, filter and destination operands.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;

// Epilogue functor. Judging by its name: bias add + linear combination,
// followed by an h-swish activation and a clamp.
using EpilogueOp =
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                int8_t, 4, int32_t, int32_t, float>;

// Complete kernel configuration (SIMT op class, Sm61 arch tag).
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc,
        int8_t, LayoutFilter,
        int8_t, LayoutDst,
        int32_t, LayoutDst,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape,
        EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        // Positional tuning arguments; their meaning is defined by the
        // Convolution template declaration, which is not visible here.
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Emit the wrapper's code for this configuration into this object file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        Convolution::ElementSrc const* d_src,
        Convolution::ElementFilter const* d_filter,
        Convolution::ElementBias const* d_bias,
        Convolution::ElementDst const* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        Convolution::ConvolutionParameter const& conv_param,
        Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);

#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_id.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration. Everything in this file is compiled out when
// MEGDNN_TEGRA_X1 is non-zero.

// The cutlass headers pulled in below trip these GCC warnings; silence them
// until the matching "pop" at the end of the file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tile shapes at thread-block, warp and instruction granularity.
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;

// Tensor layouts of the source, filter and destination operands.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;

// Epilogue functor. Judging by its name: bias add + linear combination
// followed by a clamp, with no additional activation.
using EpilogueOp =
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                int8_t, 4, int32_t, int32_t, float>;

// Complete kernel configuration (SIMT op class, Sm61 arch tag).
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc,
        int8_t, LayoutFilter,
        int8_t, LayoutDst,
        int32_t, LayoutDst,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape,
        EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        // Positional tuning arguments; their meaning is defined by the
        // Convolution template declaration, which is not visible here.
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Emit the wrapper's code for this configuration into this object file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        Convolution::ElementSrc const* d_src,
        Convolution::ElementFilter const* d_filter,
        Convolution::ElementBias const* d_bias,
        Convolution::ElementDst const* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        Convolution::ConvolutionParameter const& conv_param,
        Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);

#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_relu.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration. Everything in this file is compiled out when
// MEGDNN_TEGRA_X1 is non-zero.

// The cutlass headers pulled in below trip these GCC warnings; silence them
// until the matching "pop" at the end of the file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tile shapes at thread-block, warp and instruction granularity.
using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;

// Tensor layouts of the source, filter and destination operands.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;

// Epilogue functor. Judging by its name: bias add + linear combination,
// followed by a ReLU activation and a clamp.
using EpilogueOp =
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                int8_t, 4, int32_t, int32_t, float>;

// Complete kernel configuration (SIMT op class, Sm61 arch tag).
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc,
        int8_t, LayoutFilter,
        int8_t, LayoutDst,
        int32_t, LayoutDst,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape,
        EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        // Positional tuning arguments; their meaning is defined by the
        // Convolution template declaration, which is not visible here.
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Emit the wrapper's code for this configuration into this object file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        Convolution::ElementSrc const* d_src,
        Convolution::ElementFilter const* d_filter,
        Convolution::ElementBias const* d_bias,
        Convolution::ElementDst const* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        Convolution::ConvolutionParameter const& conv_param,
        Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);

#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_hswish.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration. Everything in this file is compiled out when
// MEGDNN_TEGRA_X1 is non-zero.

// The cutlass headers pulled in below trip these GCC warnings; silence them
// until the matching "pop" at the end of the file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tile shapes at thread-block, warp and instruction granularity.
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;

// Tensor layouts of the source, filter and destination operands. The
// destination is TensorNCxHWx<32> while the source is TensorNCxHWx<4>, so
// input and output use different layout parameters in this configuration.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<32>;

// Epilogue functor. Judging by its name: bias add + linear combination,
// followed by an h-swish activation and a clamp.
using EpilogueOp =
        cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
                int8_t, 4, int32_t, int32_t, float>;

// Complete kernel configuration (SIMT op class, Sm61 arch tag).
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc,
        int8_t, LayoutFilter,
        int8_t, LayoutDst,
        int32_t, LayoutDst,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape,
        EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        // Positional tuning arguments; their meaning is defined by the
        // Convolution template declaration, which is not visible here.
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Emit the wrapper's code for this configuration into this object file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        Convolution::ElementSrc const* d_src,
        Convolution::ElementFilter const* d_filter,
        Convolution::ElementBias const* d_bias,
        Convolution::ElementDst const* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        Convolution::ConvolutionParameter const& conv_param,
        Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);

#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_id.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration. Everything in this file is compiled out when
// MEGDNN_TEGRA_X1 is non-zero.

// The cutlass headers pulled in below trip these GCC warnings; silence them
// until the matching "pop" at the end of the file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tile shapes at thread-block, warp and instruction granularity.
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;

// Tensor layouts of the source, filter and destination operands. The
// destination is TensorNCxHWx<32> while the source is TensorNCxHWx<4>, so
// input and output use different layout parameters in this configuration.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<32>;

// Epilogue functor. Judging by its name: bias add + linear combination
// followed by a clamp, with no additional activation.
using EpilogueOp =
        cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
                int8_t, 4, int32_t, int32_t, float>;

// Complete kernel configuration (SIMT op class, Sm61 arch tag).
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc,
        int8_t, LayoutFilter,
        int8_t, LayoutDst,
        int32_t, LayoutDst,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape,
        EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        // Positional tuning arguments; their meaning is defined by the
        // Convolution template declaration, which is not visible here.
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Emit the wrapper's code for this configuration into this object file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        Convolution::ElementSrc const* d_src,
        Convolution::ElementFilter const* d_filter,
        Convolution::ElementBias const* d_bias,
        Convolution::ElementDst const* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        Convolution::ConvolutionParameter const& conv_param,
        Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);

#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_relu.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Explicit instantiation of cutlass_convolution_wrapper for one int8
// convolution configuration. Everything in this file is compiled out when
// MEGDNN_TEGRA_X1 is non-zero.

// The cutlass headers pulled in below trip these GCC warnings; silence them
// until the matching "pop" at the end of the file.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tile shapes at thread-block, warp and instruction granularity.
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;

// Tensor layouts of the source, filter and destination operands. The
// destination is TensorNCxHWx<32> while the source is TensorNCxHWx<4>, so
// input and output use different layout parameters in this configuration.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<32>;

// Epilogue functor. Judging by its name: bias add + linear combination,
// followed by a ReLU activation and a clamp.
using EpilogueOp =
        cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
                int8_t, 4, int32_t, int32_t, float>;

// Complete kernel configuration (SIMT op class, Sm61 arch tag).
using Convolution = cutlass::conv::device::Convolution<
        int8_t, LayoutSrc,
        int8_t, LayoutFilter,
        int8_t, LayoutDst,
        int32_t, LayoutDst,
        int32_t,
        cutlass::conv::ConvType::kConvolution,
        cutlass::arch::OpClassSimt,
        cutlass::arch::Sm61,
        ThreadBlockShape, WarpShape, InstructionShape,
        EpilogueOp,
        cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
        // Positional tuning arguments; their meaning is defined by the
        // Convolution template declaration, which is not visible here.
        2, 4, 16, true,
        cutlass::arch::OpMultiplyAddSaturate>;

// Emit the wrapper's code for this configuration into this object file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        Convolution::ElementSrc const* d_src,
        Convolution::ElementFilter const* d_filter,
        Convolution::ElementBias const* d_bias,
        Convolution::ElementDst const* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        Convolution::ConvolutionParameter const& conv_param,
        Convolution::EpilogueOutputOp::Params const& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);

#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_hswish.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Purpose: explicitly instantiate cutlass_convolution_wrapper for a single
// int8 convolution configuration (OpClassSimt, Sm61, 128x32x32 threadblock
// tile, 64x32x32 warp tile, BiasAddLinearCombinationHSwishClamp epilogue).
// Everything between the #if above and the matching #endif is compiled out
// when MEGDNN_TEGRA_X1 evaluates to non-zero.
//
// ignore warning of cutlass
// (the push/pop pair scopes the suppressed warnings to the wrapper include
// and the instantiation below)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts: NCxHWx<4> for the source, CxRSKx<4> for the filter and
// NCxHWx<32> for the destination.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
// GEMM tile shapes at threadblock, warp and instruction granularity.
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue functor. NOTE(review): going by its name this applies bias add,
// linear combination, h-swish and clamp; the exact semantics and the meaning
// of the template arguments live in cutlass -- confirm there.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
// Fully specified convolution operator type.
// NOTE(review): the argument list is positional. The element types look like
// src/filter/dst = int8_t and bias/accumulator = int32_t, and the trailing
// `2, 4, 16, true` are presumably stage count, alignments and a boolean
// option -- verify against the cutlass::conv::device::Convolution declaration.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation definition: emits the wrapper specialised for the
// Convolution type above in this translation unit. Every parameter type is
// taken from Convolution's member types, so the signature must stay in sync
// with the template declared in the included .cuinl file.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_id.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Purpose: explicitly instantiate cutlass_convolution_wrapper for a single
// int8 convolution configuration (OpClassSimt, Sm61, 128x32x32 threadblock
// tile, 64x32x32 warp tile, BiasAddLinearCombinationClamp epilogue).
// Everything between the #if above and the matching #endif is compiled out
// when MEGDNN_TEGRA_X1 evaluates to non-zero.
//
// ignore warning of cutlass
// (the push/pop pair scopes the suppressed warnings to the wrapper include
// and the instantiation below)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts: NCxHWx<4> for the source, CxRSKx<4> for the filter and
// NCxHWx<32> for the destination.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
// GEMM tile shapes at threadblock, warp and instruction granularity.
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue functor. NOTE(review): going by its name this applies bias add,
// linear combination and clamp with no extra activation; the exact semantics
// and the meaning of the template arguments live in cutlass -- confirm there.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
// Fully specified convolution operator type.
// NOTE(review): the argument list is positional. The element types look like
// src/filter/dst = int8_t and bias/accumulator = int32_t, and the trailing
// `2, 4, 16, true` are presumably stage count, alignments and a boolean
// option -- verify against the cutlass::conv::device::Convolution declaration.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation definition: emits the wrapper specialised for the
// Convolution type above in this translation unit. Every parameter type is
// taken from Convolution's member types, so the signature must stay in sync
// with the template declared in the included .cuinl file.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_relu.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Purpose: explicitly instantiate cutlass_convolution_wrapper for a single
// int8 convolution configuration (OpClassSimt, Sm61, 128x32x32 threadblock
// tile, 64x32x32 warp tile, BiasAddLinearCombinationReluClamp epilogue).
// Everything between the #if above and the matching #endif is compiled out
// when MEGDNN_TEGRA_X1 evaluates to non-zero.
//
// ignore warning of cutlass
// (the push/pop pair scopes the suppressed warnings to the wrapper include
// and the instantiation below)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts: NCxHWx<4> for the source, CxRSKx<4> for the filter and
// NCxHWx<32> for the destination.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
// GEMM tile shapes at threadblock, warp and instruction granularity.
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue functor. NOTE(review): going by its name this applies bias add,
// linear combination, ReLU and clamp; the exact semantics and the meaning of
// the template arguments live in cutlass -- confirm there.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
// Fully specified convolution operator type.
// NOTE(review): the argument list is positional. The element types look like
// src/filter/dst = int8_t and bias/accumulator = int32_t, and the trailing
// `2, 4, 16, true` are presumably stage count, alignments and a boolean
// option -- verify against the cutlass::conv::device::Convolution declaration.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation definition: emits the wrapper specialised for the
// Convolution type above in this translation unit. Every parameter type is
// taken from Convolution's member types, so the signature must stay in sync
// with the template declared in the included .cuinl file.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_hswish.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Purpose: explicitly instantiate cutlass_convolution_wrapper for a single
// int8 convolution configuration (OpClassSimt, Sm61, 128x64x32 threadblock
// tile, 64x32x32 warp tile, BiasAddLinearCombinationHSwishClamp epilogue).
// Everything between the #if above and the matching #endif is compiled out
// when MEGDNN_TEGRA_X1 evaluates to non-zero.
//
// ignore warning of cutlass
// (the push/pop pair scopes the suppressed warnings to the wrapper include
// and the instantiation below)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts: NCxHWx<4> for the source, CxRSKx<4> for the filter and
// NCxHWx<32> for the destination.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
// GEMM tile shapes at threadblock, warp and instruction granularity.
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue functor. NOTE(review): going by its name this applies bias add,
// linear combination, h-swish and clamp; the exact semantics and the meaning
// of the template arguments live in cutlass -- confirm there.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
// Fully specified convolution operator type.
// NOTE(review): the argument list is positional. The element types look like
// src/filter/dst = int8_t and bias/accumulator = int32_t, and the trailing
// `2, 4, 16, true` are presumably stage count, alignments and a boolean
// option -- verify against the cutlass::conv::device::Convolution declaration.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation definition: emits the wrapper specialised for the
// Convolution type above in this translation unit. Every parameter type is
// taken from Convolution's member types, so the signature must stay in sync
// with the template declared in the included .cuinl file.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 36
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_id.cu View File

@@ -1,36 +0,0 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_int8_kern_impls.py
//
// Purpose: explicitly instantiate cutlass_convolution_wrapper for a single
// int8 convolution configuration (OpClassSimt, Sm61, 128x64x32 threadblock
// tile, 64x32x32 warp tile, BiasAddLinearCombinationClamp epilogue).
// Everything between the #if above and the matching #endif is compiled out
// when MEGDNN_TEGRA_X1 evaluates to non-zero.
//
// ignore warning of cutlass
// (the push/pop pair scopes the suppressed warnings to the wrapper include
// and the instantiation below)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

// Tensor layouts: NCxHWx<4> for the source, CxRSKx<4> for the filter and
// NCxHWx<32> for the destination.
using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
// GEMM tile shapes at threadblock, warp and instruction granularity.
using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>;
using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
// Epilogue functor. NOTE(review): going by its name this applies bias add,
// linear combination and clamp with no extra activation; the exact semantics
// and the meaning of the template arguments live in cutlass -- confirm there.
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
// Fully specified convolution operator type.
// NOTE(review): the argument list is positional. The element types look like
// src/filter/dst = int8_t and bias/accumulator = int32_t, and the trailing
// `2, 4, 16, true` are presumably stage count, alignments and a boolean
// option -- verify against the cutlass::conv::device::Convolution declaration.
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
// Explicit instantiation definition: emits the wrapper specialised for the
// Convolution type above in this translation unit. Every parameter type is
// taken from Convolution's member types, so the signature must stay in sync
// with the template declared in the included .cuinl file.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream, typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save