Browse Source

feat(dnn/cuda): add deconv int8 and fix cutlass conv wrapper base on modify cutlass 2.4

GitOrigin-RevId: 49e0565e8a
tags/v1.3.0
Megvii Engine Team 4 years ago
parent
commit
5d350fc843
100 changed files with 382 additions and 461 deletions
  1. +1
    -0
      .gitattributes
  2. +28
    -9
      dnn/src/common/convolution.cpp
  3. +49
    -54
      dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu
  4. +16
    -14
      dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl
  5. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_hswish.cu
  6. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_id.cu
  7. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_relu.cu
  8. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_hswish.cu
  9. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_id.cu
  10. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_relu.cu
  11. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_hswish.cu
  12. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_id.cu
  13. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_relu.cu
  14. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu
  15. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu
  16. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu
  17. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_hswish.cu
  18. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu
  19. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_relu.cu
  20. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_hswish.cu
  21. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_id.cu
  22. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_relu.cu
  23. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_hswish.cu
  24. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_id.cu
  25. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_relu.cu
  26. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_hswish.cu
  27. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_id.cu
  28. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_relu.cu
  29. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu
  30. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu
  31. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu
  32. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_hswish.cu
  33. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_id.cu
  34. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_relu.cu
  35. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_hswish.cu
  36. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_id.cu
  37. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_relu.cu
  38. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_hswish.cu
  39. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_id.cu
  40. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_relu.cu
  41. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_hswish.cu
  42. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_id.cu
  43. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_relu.cu
  44. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_hswish.cu
  45. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_id.cu
  46. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_relu.cu
  47. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_hswish.cu
  48. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_id.cu
  49. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_relu.cu
  50. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_hswish.cu
  51. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_id.cu
  52. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_relu.cu
  53. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_hswish.cu
  54. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu
  55. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_relu.cu
  56. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_hswish.cu
  57. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_id.cu
  58. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_relu.cu
  59. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_hswish.cu
  60. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_id.cu
  61. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_relu.cu
  62. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_hswish.cu
  63. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu
  64. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_relu.cu
  65. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_hswish.cu
  66. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_id.cu
  67. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_relu.cu
  68. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_hswish.cu
  69. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_id.cu
  70. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_relu.cu
  71. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_hswish.cu
  72. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_id.cu
  73. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_relu.cu
  74. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_hswish.cu
  75. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_id.cu
  76. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_relu.cu
  77. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_hswish.cu
  78. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_id.cu
  79. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_relu.cu
  80. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_hswish.cu
  81. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_id.cu
  82. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_relu.cu
  83. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_hswish.cu
  84. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_id.cu
  85. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_relu.cu
  86. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_hswish.cu
  87. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_id.cu
  88. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_relu.cu
  89. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_hswish.cu
  90. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_id.cu
  91. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_relu.cu
  92. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_hswish.cu
  93. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_id.cu
  94. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_relu.cu
  95. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_hswish.cu
  96. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_id.cu
  97. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_relu.cu
  98. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_hswish.cu
  99. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_id.cu
  100. +3
    -4
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_relu.cu

+ 1
- 0
.gitattributes View File

@@ -5,6 +5,7 @@ dnn/src/cuda/conv_bias/int8_imma/kimpl/* binary
dnn/src/cuda/batch_conv_bias/int8/kimpl/* binary
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/* binary
dnn/src/cuda/sass/prebuilt/map_defs.cpp binary
dnn/src/cuda/convolution/backward_data/int8/kimpl/* binary
tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text
*.caffemodel filter=lfs diff=lfs merge=lfs -text
imperative/python/test/integration/data/*.mge filter=lfs diff=lfs merge=lfs -text


+ 28
- 9
dnn/src/common/convolution.cpp View File

@@ -46,7 +46,7 @@ void make_canonized_filter_meta_nchw_nhwc(
size_t src_ndim, const TensorLayout& filter, const Param& param,
typename ConvolutionBase<Parameter>::CanonizedFilterMeta& ret) {
megdnn_assert(param.format == Param::Format::NCHW ||
param.format == Param::Format::NHWC );
param.format == Param::Format::NHWC);
auto img_ndim = src_ndim - 2;
size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos;
if (param.sparse == Param::Sparse::DENSE) {
@@ -320,8 +320,8 @@ void make_canonized_filter_meta_nchwxx(
img_ndim, filter.ndim);
megdnn_assert((filter[filter.ndim - 1] == pack_size &&
filter[filter.ndim - 2] == pack_size) ||
(filter[filter.ndim - 1] == 2 * pack_size &&
filter[filter.ndim - 2] == 2 * pack_size),
(filter[filter.ndim - 1] == 2 * pack_size &&
filter[filter.ndim - 2] == 2 * pack_size),
"last 2 dim of filter must be %zu, but got %zu, %zu",
pack_size, filter[filter.ndim - 2],
filter[filter.ndim - 1]);
@@ -684,7 +684,8 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
}
if (param().format == Param::Format::NCHW44 ||
param().format == Param::Format::NCHW44_DOT) {
//!support nchw44 filter change to 88 for int8 winogradf23_88 using MK8 mamtul
//! support nchw44 filter change to 88 for int8 winogradf23_88 using
//! MK8 mamtul
megdnn_assert((src.ndim == 4 && filter.ndim == 5 &&
filter[filter.ndim - 1] == 4) ||
(src.ndim == 5 &&
@@ -716,7 +717,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
"currently only convolution on 2D image is supported");
auto cflt = make_canonized_filter_meta(src.ndim, filter);
if (param().format == Param::Format::NCHW ||
param().format == Param::Format::NHWC ) {
param().format == Param::Format::NHWC) {
size_t src_or_dst_c_pos = 0;
size_t src_or_dst_spatial_start = 0;
if (param().format == Param::Format::NCHW) {
@@ -790,7 +791,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src,
dst[3] = infer_conv_shape(src[3], cflt.dilated_spatial[1],
cflt.stride[1], cflt.padding[1]);
dst[4] = 32;
} else if (param().format == Param::Format::NCHW88 ) {
} else if (param().format == Param::Format::NCHW88) {
megdnn_assert(src.ndim == 5 || (src.ndim == 4 && src[1] <= 8),
"invalid src ndim for NCHW88, expected=5 or 4, got=%zu",
src.ndim);
@@ -1042,10 +1043,10 @@ void ConvolutionBackwardData::deduce_dtype(DType filter, DType diff,
}
megdnn_assert(param().compute_mode != Param::ComputeMode::FLOAT32
#if !MEGDNN_DISABLE_FLOAT16
|| filter.enumv() == DTypeEnum::Float16
|| filter.enumv() == DTypeEnum::BFloat16
|| filter.enumv() == DTypeEnum::Float16 ||
filter.enumv() == DTypeEnum::BFloat16
#endif
,
,
"ComputeMode::FLOAT32 is only available for Float16/BFloat16 "
"input / output.");
}
@@ -1096,6 +1097,24 @@ void ConvolutionBackwardData::deduce_layout(const TensorLayout& filter,
diff[i + src_or_dst_spatial_start], cflt.dilated_spatial[i],
cflt.stride[i], cflt.padding[i]);
}
} else if (param().format == Param::Format::NCHW4) {
megdnn_assert(diff.ndim == 5,
"valid diff ndim for NCHW4, expected=5, got=%zu",
diff.ndim);
megdnn_assert(cflt.group == 1, "%s", errmsg().c_str());
megdnn_assert(cflt.ocpg * cflt.group == diff[1] * 4, "%s",
errmsg().c_str());
grad.ndim = diff.ndim;
grad[0] = diff[0];
auto ic = cflt.icpg * cflt.group;
megdnn_assert(ic % 4 == 0);
grad[1] = ic / 4;
grad[2] = deduce(diff[2], cflt.dilated_spatial[0], cflt.stride[0],
cflt.padding[0]);
grad[3] = deduce(diff[3], cflt.dilated_spatial[1], cflt.stride[1],
cflt.padding[1]);
megdnn_assert(diff[4] == 4);
grad[4] = 4;
} else {
megdnn_assert(param().format == Param::Format::NHWCD4);
megdnn_assert(diff.ndim == 5,


+ 49
- 54
dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu View File

@@ -62,22 +62,21 @@ void megdnn::cuda::cutlass_wrapper::
threadblock_k_>; \
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; \
using Convolution = cutlass::convolution::device::Convolution< \
using Convolution = cutlass::conv::device::Convolution< \
int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t, \
cutlass::layout::TensorCxRSKx<32>, ElementOutput, \
cutlass::layout::TensorNCxHWx<32>, int32_t, \
cutlass::layout::TensorNCxHWx<32>, int32_t, \
cutlass::convolution::ConvType::kConvolution, \
cutlass::conv::ConvType::kConvolution, \
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \
cutlass::convolution::threadblock:: \
ConvolutionNCxHWxThreadblockSwizzle< \
cutlass::convolution::ConvType::kConvolution>, \
cutlass::conv::threadblock:: \
ConvolutionFpropNCxHWxThreadblockSwizzle, \
2, 16, 16, NeedLoadFromConstMem>; \
typename Convolution::ConvolutionParameter conv_param{ \
param.n, param.ci, param.co, param.hi, param.wi, \
param.fh, param.fw, param.ho, param.wo, param.sh, \
param.sw, param.ph, param.pw, 1, 1}; \
typename Convolution::ConvolutionParameter conv_param( \
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \
param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \
param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \
return cutlass_convolution_wrapper<Convolution>( \
d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \
epilogue, stream); \
@@ -186,22 +185,21 @@ void megdnn::cuda::cutlass_wrapper::
threadblock_k_>; \
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; \
using Convolution = cutlass::convolution::device::Convolution< \
using Convolution = cutlass::conv::device::Convolution< \
int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t, \
cutlass::layout::TensorCxRSKx<32>, ElementOutput, \
cutlass::layout::TensorNCxHWx<4>, int32_t, \
cutlass::layout::TensorNCxHWx<4>, int32_t, \
cutlass::convolution::ConvType::kConvolution, \
cutlass::conv::ConvType::kConvolution, \
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \
cutlass::convolution::threadblock:: \
ConvolutionNCxHWxThreadblockSwizzle< \
cutlass::convolution::ConvType::kConvolution>, \
cutlass::conv::threadblock:: \
ConvolutionFpropNCxHWxThreadblockSwizzle, \
2, 16, 16, NeedLoadFromConstMem>; \
typename Convolution::ConvolutionParameter conv_param{ \
param.n, param.ci, param.co, param.hi, param.wi, \
param.fh, param.fw, param.ho, param.wo, param.sh, \
param.sw, param.ph, param.pw, 1, 1}; \
typename Convolution::ConvolutionParameter conv_param( \
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \
param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \
param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \
return cutlass_convolution_wrapper<Convolution>( \
d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \
epilogue, stream); \
@@ -311,22 +309,21 @@ void megdnn::cuda::cutlass_wrapper::
threadblock_k_>; \
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \
using Convolution = cutlass::convolution::device::Convolution< \
using Convolution = cutlass::conv::device::Convolution< \
int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \
cutlass::layout::TensorCxRSKx<4>, ElementOutput, \
cutlass::layout::TensorNCxHWx<4>, int32_t, \
cutlass::layout::TensorNCxHWx<4>, int32_t, \
cutlass::convolution::ConvType::kConvolution, \
cutlass::conv::ConvType::kConvolution, \
cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \
cutlass::convolution::threadblock:: \
ConvolutionNCxHWxThreadblockSwizzle< \
cutlass::convolution::ConvType::kConvolution>, \
cutlass::conv::threadblock:: \
ConvolutionFpropNCxHWxThreadblockSwizzle, \
stage_, 4, aligned_, NeedLoadFromConstMem>; \
typename Convolution::ConvolutionParameter conv_param{ \
param.n, param.ci, param.co, param.hi, param.wi, \
param.fh, param.fw, param.ho, param.wo, param.sh, \
param.sw, param.ph, param.pw, 1, 1}; \
typename Convolution::ConvolutionParameter conv_param( \
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \
param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \
param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \
return cutlass_convolution_wrapper<Convolution>( \
d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \
epilogue, stream); \
@@ -441,23 +438,22 @@ void megdnn::cuda::cutlass_wrapper::
threadblock_k_>; \
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \
using Convolution = cutlass::convolution::device::Convolution< \
using Convolution = cutlass::conv::device::Convolution< \
int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \
cutlass::layout::TensorCxRSKx<4>, ElementOutput, \
cutlass::layout::TensorNCHW, float, \
cutlass::layout::TensorNCHW, int32_t, \
cutlass::convolution::ConvType::kConvolution, \
cutlass::conv::ConvType::kConvolution, \
cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \
cutlass::convolution::threadblock:: \
ConvolutionNCxHWxThreadblockSwizzle< \
cutlass::convolution::ConvType::kConvolution>, \
cutlass::conv::threadblock:: \
ConvolutionFpropNCxHWxThreadblockSwizzle, \
stages_, 4, aligned_, NeedLoadFromConstMem, \
cutlass::arch::OpMultiplyAdd>; \
typename Convolution::ConvolutionParameter conv_param{ \
param.n, param.ci, param.co, param.hi, param.wi, \
param.fh, param.fw, param.ho, param.wo, param.sh, \
param.sw, param.ph, param.pw, 1, 1}; \
typename Convolution::ConvolutionParameter conv_param( \
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \
param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \
param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \
return cutlass_convolution_wrapper<Convolution>( \
d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \
epilogue, stream); \
@@ -572,36 +568,35 @@ void megdnn::cuda::cutlass_wrapper::
threadblock_k_>; \
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \
using Convolution = cutlass::convolution::device::Convolution< \
using Convolution = cutlass::conv::device::Convolution< \
int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \
cutlass::layout::TensorCxRSKx<4>, ElementOutput, \
cutlass::layout::TensorNCxHWx<32>, int32_t, \
cutlass::layout::TensorNCxHWx<32>, int32_t, \
cutlass::convolution::ConvType::kConvolution, \
cutlass::conv::ConvType::kConvolution, \
cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \
cutlass::convolution::threadblock:: \
ConvolutionNCxHWxThreadblockSwizzle< \
cutlass::convolution::ConvType::kConvolution>, \
cutlass::conv::threadblock:: \
ConvolutionFpropNCxHWxThreadblockSwizzle, \
stages_, 4, aligned_, NeedLoadFromConstMem>; \
typename Convolution::ConvolutionParameter conv_param{ \
param.n, param.ci, param.co, param.hi, param.wi, \
param.fh, param.fw, param.ho, param.wo, param.sh, \
param.sw, param.ph, param.pw, 1, 1}; \
typename Convolution::ConvolutionParameter conv_param( \
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \
param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \
param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \
return cutlass_convolution_wrapper<Convolution>( \
d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \
epilogue, stream); \
}
#define DISPATCH_KERNEL \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \
megdnn_assert(false, \
"unsupported threadblock shape (%dx%dx%d) and warp shape " \
"(%dx%dx%d)", \


+ 16
- 14
dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl View File

@@ -29,28 +29,30 @@ void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper(
cudaStream_t stream) {
typename Convolution::TensorRefSrc tensor_src{
const_cast<typename Convolution::ElementSrc*>(d_src),
Convolution::LayoutSrc::packed({conv_param.n(), conv_param.hi(),
conv_param.wi(), conv_param.ci()})};
Convolution::LayoutSrc::packed(
{conv_param.N, conv_param.H, conv_param.W, conv_param.C})};
typename Convolution::TensorRefFilter tensor_filter{
const_cast<typename Convolution::ElementFilter*>(d_filter),
Convolution::LayoutFilter::packed({conv_param.co(), conv_param.fh(),
conv_param.fw(),
conv_param.ci()})};
Convolution::LayoutFilter::packed(
{conv_param.K, conv_param.R, conv_param.S, conv_param.C})};
typename Convolution::TensorRefBias tensor_bias{
const_cast<typename Convolution::ElementBias*>(d_bias),
Convolution::LayoutBias::packed({1, 1, 1, conv_param.co()})};
Convolution::LayoutBias::packed({1, 1, 1, conv_param.K})};
typename Convolution::TensorRefDst tensor_z{
const_cast<typename Convolution::ElementDst*>(d_z),
Convolution::LayoutDst::packed({conv_param.n(), conv_param.ho(),
conv_param.wo(), conv_param.co()})};
Convolution::LayoutDst::packed(
{conv_param.N, conv_param.P, conv_param.Q, conv_param.K})};
typename Convolution::TensorRefDst tensor_dst{
d_dst,
Convolution::LayoutDst::packed({conv_param.n(), conv_param.ho(),
conv_param.wo(), conv_param.co()})};
typename Convolution::Arguments arguments{
conv_param, tensor_src, tensor_filter,
tensor_bias, tensor_z, tensor_dst.non_const_ref(),
epilogue};
Convolution::LayoutDst::packed(
{conv_param.N, conv_param.P, conv_param.Q, conv_param.K})};
typename Convolution::Arguments arguments{conv_param,
tensor_src.non_const_ref(),
tensor_filter.non_const_ref(),
tensor_bias.non_const_ref(),
tensor_z.non_const_ref(),
tensor_dst.non_const_ref(),
epilogue};
Convolution conv_op;
cutlass_check(conv_op.initialize(arguments, workspace));
cutlass_check(conv_op(stream));


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
1, 4, 8, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
1, 4, 8, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
1, 4, 8, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 4, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 4, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 4, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
1, 4, 8, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
1, 4, 8, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
1, 4, 8, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 4, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 4, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 4, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_hswish.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_id.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


+ 3
- 4
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_relu.cu View File

@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
using Convolution = cutlass::conv::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2, 4, 16, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(


Some files were not shown because too many files changed in this diff

Loading…
Cancel
Save