Browse Source

feat(dnn/cuda): add convolution with i8 input and u4 output

GitOrigin-RevId: 8be439abf1
release-1.5
Megvii Engine Team huangxinda 4 years ago
parent
commit
b4687ce8da
71 changed files with 1912 additions and 75 deletions
  1. +2
    -2
      dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu
  2. +1
    -1
      dnn/src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl
  3. +24
    -5
      dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp
  4. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu
  5. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu
  6. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu
  7. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu
  8. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu
  9. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu
  10. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu
  11. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu
  12. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu
  13. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu
  14. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu
  15. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu
  16. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu
  17. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu
  18. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu
  19. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu
  20. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu
  21. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu
  22. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu
  23. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu
  24. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu
  25. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu
  26. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu
  27. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu
  28. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu
  29. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu
  30. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu
  31. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu
  32. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu
  33. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu
  34. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu
  35. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu
  36. +2
    -1
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu
  37. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu
  38. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu
  39. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu
  40. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu
  41. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu
  42. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu
  43. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu
  44. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu
  45. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu
  46. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu
  47. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu
  48. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu
  49. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu
  50. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu
  51. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu
  52. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu
  53. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu
  54. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu
  55. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu
  56. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu
  57. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu
  58. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu
  59. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu
  60. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu
  61. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu
  62. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu
  63. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu
  64. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu
  65. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu
  66. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu
  67. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu
  68. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu
  69. +55
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu
  70. +0
    -1
      src/gopt/impl/tensor_reformat.cpp
  71. +4
    -33
      src/gopt/test/inference.cpp

+ 2
- 2
dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu View File

@@ -960,7 +960,7 @@ void megdnn::cuda::cutlass_wrapper::
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \
cutlass::conv::threadblock:: \
ConvolutionFpropNCxHWxThreadblockSwizzle, \
stages_, 4, aligned_, NeedLoadFromConstMem, \
stages_, 4, aligned_, true, \
cutlass::arch::OpMultiplyAddSaturate>; \
typename Convolution::ConvolutionParameter conv_param( \
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \
@@ -1020,7 +1020,7 @@ void megdnn::cuda::cutlass_wrapper::
ElementOutput, 8, ElementAccumulator, ElementBias,
ElementCompute>;
typename EpilogueOp::Params epilogue{alpha, beta, gamma,
scale, detla, theta};
scale, delta, theta};
DISPATCH_KERNEL;
}
default:


+ 1
- 1
dnn/src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl View File

@@ -1,6 +1,6 @@
/**
* \file
* dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl
* dnn/src/cuda/conv_bias/int8/implicit_gemm_conv_bias_cutlass_wrapper.cuinl
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved.


+ 24
- 5
dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp View File

@@ -181,6 +181,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
float alpha = src_scale * filter_scale;
float beta = 1.f;
float dst_scale = 1.f;
float gamma = 0.f;
float theta = 0.f;
if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
theta = args.dst_layout->dtype.param<dtype::Quantized4Asymm>()
.zero_point;
}
if (args.bias_layout->dtype.enumv() == DTypeEnum::QuantizedS32) {
megdnn_assert(args.dst_layout->dtype.category() ==
DTypeCategory::QUANTIZED);
@@ -189,7 +195,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
dst_scale = get_scale(args.dst_layout->dtype);
alpha /= dst_scale, beta = bias_scale / dst_scale;
}
float gamma = 0.f;
float delta = 0.f;
if (args.z_layout->ndim > 0) {
gamma = 1.f;
if (args.z_layout->dtype.category() == DTypeCategory::QUANTIZED) {
@@ -198,6 +204,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
float z_scale = get_scale(args.z_layout->dtype);
gamma = z_scale / dst_scale;
}
if (args.z_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
uint8_t z_zero =
args.z_layout->dtype.param<dtype::Quantized4Asymm>()
.zero_point;
delta = -z_zero * gamma;
}
}
uint32_t nonlinear_mode = static_cast<uint32_t>(param.nonlineMode);
bool nonunity_kernel = !(fh == 1 && fw == 1);
@@ -244,14 +256,15 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
DISPATCH(false);
#undef cb
} else if (param.format == Format::NCHW4_NHWC) {
#define cb(_nonunity_kernel) \
#define cb(_signedness) \
cutlass_wrapper::do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nhwc< \
_nonunity_kernel>( \
_signedness>( \
args.src_tensor->compatible_ptr<int8_t>(), filter_ptr, \
args.bias_tensor->compatible_ptr<int32_t>(), \
reinterpret_cast<int8_t*>(args.z_tensor->raw_ptr), \
reinterpret_cast<int8_t*>(args.dst_tensor->raw_ptr), nullptr, \
kern_param, nonlinear_mode, alpha, beta, gamma, dst_scale, \
kern_param, nonlinear_mode, alpha, beta, gamma, delta, theta, \
dst_scale, \
cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m, \
m_algo_param.threadblock_n, \
m_algo_param.threadblock_k}, \
@@ -259,7 +272,13 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
m_algo_param.warp_n, \
m_algo_param.warp_k}, \
m_algo_param.stage, stream);
cb(true);
if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
cb(true);
} else {
megdnn_assert(args.dst_layout->dtype.enumv() ==
DTypeEnum::Quantized4Asymm);
cb(false);
}
#undef cb
} else {
megdnn_assert(param.format == Format::NCHW4_NCHW32);


+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 2
- 1
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolu
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator
//
// `Convolution` names one concrete specialization of
// cutlass::conv::device::Convolution; the explicit instantiation further down
// in this file is written in terms of this alias.
// NOTE(review): the primary template's parameter list is not visible in this
// file, so the per-argument notes below are inferred from the argument values
// and from the kernel name above -- confirm against the template declaration.
using Convolution =
typename cutlass::conv::device::Convolution<
// presumably source element type and layout ("s8" / "nc4hw4" in the name)
int8_t,
cutlass::layout::TensorNCxHWx<4>,
// presumably filter element type and layout
int8_t,
cutlass::layout::TensorCxRSKx<4>,
// presumably destination element type and layout: unsigned 4-bit ("u4"), NHWC
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
// presumably bias element type and layout
int32_t,
cutlass::layout::TensorNHWC,
// presumably accumulator element type
int32_t,
cutlass::conv::ConvType::kConvolution,
// SIMT operator class ("simt" in the name) with the Sm75 architecture tag
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
// 128x128x32 and 64x32x32 match the two tile shapes encoded in the kernel
// name; presumably threadblock, warp and instruction shapes, in that order
cutlass::gemm::GemmShape<128, 128, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
// epilogue functor ("hswish" in the name) producing uint4b_t output;
// NOTE(review): meaning of the remaining epilogue arguments (8, int32_t,
// int32_t, float) is not visible here -- confirm
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
// presumably the stage count ("_2_" in the name)
2,
// NOTE(review): the meaning of 4, 16 and `true` cannot be determined from
// this file -- confirm against the template declaration
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation definition of
// megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper for the
// `Convolution` alias defined above in this file.
// All pointer element types are taken from `Convolution` itself; note that
// `d_z` and `d_dst` share the same element type (Convolution::ElementDst).
// The last parameter, `extra_param`, is passed by value and has type
// Convolution::ExtraParam.
// NOTE(review): the "d_" prefix suggests device pointers -- confirm against
// the wrapper definition, which is not visible in this file.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator
//
// `Convolution` names one concrete specialization of
// cutlass::conv::device::Convolution; the explicit instantiation further down
// in this file is written in terms of this alias.
// NOTE(review): the primary template's parameter list is not visible in this
// file, so the per-argument notes below are inferred from the argument values
// and from the kernel name above -- confirm against the template declaration.
using Convolution =
typename cutlass::conv::device::Convolution<
// presumably source element type and layout ("s8" / "nc4hw4" in the name)
int8_t,
cutlass::layout::TensorNCxHWx<4>,
// presumably filter element type and layout
int8_t,
cutlass::layout::TensorCxRSKx<4>,
// presumably destination element type and layout: unsigned 4-bit ("u4"), NHWC
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
// presumably bias element type and layout
int32_t,
cutlass::layout::TensorNHWC,
// presumably accumulator element type
int32_t,
cutlass::conv::ConvType::kConvolution,
// SIMT operator class ("simt" in the name) with the Sm75 architecture tag
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
// 128x32x32 and 64x32x32 match the two tile shapes encoded in the kernel
// name; presumably threadblock, warp and instruction shapes, in that order
cutlass::gemm::GemmShape<128, 32, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
// epilogue functor ("hswish" in the name) producing uint4b_t output;
// NOTE(review): meaning of the remaining epilogue arguments (8, int32_t,
// int32_t, float) is not visible here -- confirm
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
// presumably the stage count ("_2_" in the name)
2,
// NOTE(review): the meaning of 4, 16 and `true` cannot be determined from
// this file -- confirm against the template declaration
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation definition of
// megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper for the
// `Convolution` alias defined above in this file.
// All pointer element types are taken from `Convolution` itself; note that
// `d_z` and `d_dst` share the same element type (Convolution::ElementDst).
// The last parameter, `extra_param`, is passed by value and has type
// Convolution::ExtraParam.
// NOTE(review): the "d_" prefix suggests device pointers -- confirm against
// the wrapper definition, which is not visible in this file.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator
// Type alias configuring the convolution kernel for this instance. Visible
// choices: int8_t with TensorNCxHWx<4> and TensorCxRSKx<4> layouts,
// cutlass::uint4b_t and int32_t with TensorNHWC layouts, OpClassSimt on Sm75,
// and the BiasAddLinearCombinationHSwishClamp epilogue ("hswish" in the name).
// GemmShape<128, 64, 32>, GemmShape<64, 32, 32> and the `2` after the swizzle
// mirror "128x64x32_64x32x32_2" in the instance name.
// NOTE(review): which role each positional argument plays is defined by
// cutlass::conv::device::Convolution, outside this file -- confirm there
// before hand-editing; this file is generator output.
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of cutlass_convolution_wrapper for the Convolution
// alias above, so the wrapper for this kernel configuration is compiled in
// this translation unit. Pointer element types come from Convolution's
// nested types.
// NOTE(review): the wrapper's definition presumably comes from the .cuinl
// included at the top of this file -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc" generated by cutlass generator
// Type alias configuring the convolution kernel for this instance. Visible
// choices: int8_t with TensorNCxHWx<4> and TensorCxRSKx<4> layouts,
// cutlass::uint4b_t and int32_t with TensorNHWC layouts, OpClassSimt on Sm75,
// and the BiasAddLinearCombinationHSwishClamp epilogue ("hswish" in the name).
// GemmShape<16, 128, 16> (given twice) and the `1` after the swizzle mirror
// "16x128x16_16x128x16_1" in the instance name.
// NOTE(review): which role each positional argument plays is defined by
// cutlass::conv::device::Convolution, outside this file -- confirm there
// before hand-editing; this file is generator output.
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<16, 128, 16>,
cutlass::gemm::GemmShape<16, 128, 16>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
1,
4,
8,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of cutlass_convolution_wrapper for the Convolution
// alias above, so the wrapper for this kernel configuration is compiled in
// this translation unit. Pointer element types come from Convolution's
// nested types.
// NOTE(review): the wrapper's definition presumably comes from the .cuinl
// included at the top of this file -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc" generated by cutlass generator
// Type alias configuring the convolution kernel for this instance. Visible
// choices: int8_t with TensorNCxHWx<4> and TensorCxRSKx<4> layouts,
// cutlass::uint4b_t and int32_t with TensorNHWC layouts, OpClassSimt on Sm75,
// and the BiasAddLinearCombinationHSwishClamp epilogue ("hswish" in the name).
// GemmShape<16, 64, 8> (given twice) and the `2` after the swizzle mirror
// "16x64x8_16x64x8_2" in the instance name.
// NOTE(review): which role each positional argument plays is defined by
// cutlass::conv::device::Convolution, outside this file -- confirm there
// before hand-editing; this file is generator output.
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<16, 64, 8>,
cutlass::gemm::GemmShape<16, 64, 8>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
4,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of cutlass_convolution_wrapper for the Convolution
// alias above, so the wrapper for this kernel configuration is compiled in
// this translation unit. Pointer element types come from Convolution's
// nested types.
// NOTE(review): the wrapper's definition presumably comes from the .cuinl
// included at the top of this file -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc" generated by cutlass generator
// Type alias configuring the convolution kernel for this instance. Visible
// choices: int8_t with TensorNCxHWx<4> and TensorCxRSKx<4> layouts,
// cutlass::uint4b_t and int32_t with TensorNHWC layouts, OpClassSimt on Sm75,
// and the BiasAddLinearCombinationHSwishClamp epilogue ("hswish" in the name).
// GemmShape<32, 128, 32>, GemmShape<32, 64, 32> and the `2` after the swizzle
// mirror "32x128x32_32x64x32_2" in the instance name.
// NOTE(review): which role each positional argument plays is defined by
// cutlass::conv::device::Convolution, outside this file -- confirm there
// before hand-editing; this file is generator output.
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<32, 128, 32>,
cutlass::gemm::GemmShape<32, 64, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of cutlass_convolution_wrapper for the Convolution
// alias above, so the wrapper for this kernel configuration is compiled in
// this translation unit. Pointer element types come from Convolution's
// nested types.
// NOTE(review): the wrapper's definition presumably comes from the .cuinl
// included at the top of this file -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc" generated by cutlass generator
// Type alias configuring the convolution kernel for this instance. Visible
// choices: int8_t with TensorNCxHWx<4> and TensorCxRSKx<4> layouts,
// cutlass::uint4b_t and int32_t with TensorNHWC layouts, OpClassSimt on Sm75,
// and the BiasAddLinearCombinationHSwishClamp epilogue ("hswish" in the name).
// GemmShape<32, 32, 32> (given twice) and the `2` after the swizzle mirror
// "32x32x32_32x32x32_2" in the instance name.
// NOTE(review): which role each positional argument plays is defined by
// cutlass::conv::device::Convolution, outside this file -- confirm there
// before hand-editing; this file is generator output.
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<32, 32, 32>,
cutlass::gemm::GemmShape<32, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of cutlass_convolution_wrapper for the Convolution
// alias above, so the wrapper for this kernel configuration is compiled in
// this translation unit. Pointer element types come from Convolution's
// nested types.
// NOTE(review): the wrapper's definition presumably comes from the .cuinl
// included at the top of this file -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc" generated by cutlass generator
// Type alias configuring the convolution kernel for this instance. Visible
// choices: int8_t with TensorNCxHWx<4> and TensorCxRSKx<4> layouts,
// cutlass::uint4b_t and int32_t with TensorNHWC layouts, OpClassSimt on Sm75,
// and the BiasAddLinearCombinationHSwishClamp epilogue ("hswish" in the name).
// GemmShape<32, 64, 32> (given twice) and the `2` after the swizzle mirror
// "32x64x32_32x64x32_2" in the instance name.
// NOTE(review): which role each positional argument plays is defined by
// cutlass::conv::device::Convolution, outside this file -- confirm there
// before hand-editing; this file is generator output.
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<32, 64, 32>,
cutlass::gemm::GemmShape<32, 64, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of cutlass_convolution_wrapper for the Convolution
// alias above, so the wrapper for this kernel configuration is compiled in
// this translation unit. Pointer element types come from Convolution's
// nested types.
// NOTE(review): the wrapper's definition presumably comes from the .cuinl
// included at the top of this file -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator
// Type alias configuring the convolution kernel for this instance. Visible
// choices: int8_t with TensorNCxHWx<4> and TensorCxRSKx<4> layouts,
// cutlass::uint4b_t and int32_t with TensorNHWC layouts, OpClassSimt on Sm75,
// and the BiasAddLinearCombinationHSwishClamp epilogue ("hswish" in the name).
// GemmShape<64, 128, 32>, GemmShape<64, 32, 32> and the `2` after the swizzle
// mirror "64x128x32_64x32x32_2" in the instance name.
// NOTE(review): which role each positional argument plays is defined by
// cutlass::conv::device::Convolution, outside this file -- confirm there
// before hand-editing; this file is generator output.
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<64, 128, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of cutlass_convolution_wrapper for the Convolution
// alias above, so the wrapper for this kernel configuration is compiled in
// this translation unit. Pointer element types come from Convolution's
// nested types.
// NOTE(review): the wrapper's definition presumably comes from the .cuinl
// included at the top of this file -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator
// Type alias configuring the convolution kernel for this instance. Visible
// choices: int8_t with TensorNCxHWx<4> and TensorCxRSKx<4> layouts,
// cutlass::uint4b_t and int32_t with TensorNHWC layouts, OpClassSimt on Sm75,
// and the BiasAddLinearCombinationHSwishClamp epilogue ("hswish" in the name).
// GemmShape<64, 32, 32> (given twice) and the `2` after the swizzle mirror
// "64x32x32_64x32x32_2" in the instance name.
// NOTE(review): which role each positional argument plays is defined by
// cutlass::conv::device::Convolution, outside this file -- confirm there
// before hand-editing; this file is generator output.
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of cutlass_convolution_wrapper for the Convolution
// alias above, so the wrapper for this kernel configuration is compiled in
// this translation unit. Pointer element types come from Convolution's
// nested types.
// NOTE(review): the wrapper's definition presumably comes from the .cuinl
// included at the top of this file -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator
// Type alias configuring the convolution kernel for this instance. Visible
// choices: int8_t with TensorNCxHWx<4> and TensorCxRSKx<4> layouts,
// cutlass::uint4b_t and int32_t with TensorNHWC layouts, OpClassSimt on Sm75,
// and the BiasAddLinearCombinationHSwishClamp epilogue ("hswish" in the name).
// GemmShape<64, 64, 32>, GemmShape<64, 32, 32> and the `2` after the swizzle
// mirror "64x64x32_64x32x32_2" in the instance name.
// NOTE(review): which role each positional argument plays is defined by
// cutlass::conv::device::Convolution, outside this file -- confirm there
// before hand-editing; this file is generator output.
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<64, 64, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of cutlass_convolution_wrapper for the Convolution
// alias above, so the wrapper for this kernel configuration is compiled in
// this translation unit. Pointer element types come from Convolution's
// nested types.
// NOTE(review): the wrapper's definition presumably comes from the .cuinl
// included at the top of this file -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator
// Type alias configuring the convolution kernel for this instance. Visible
// choices: int8_t with TensorNCxHWx<4> and TensorCxRSKx<4> layouts,
// cutlass::uint4b_t and int32_t with TensorNHWC layouts, OpClassSimt on Sm75,
// and the BiasAddLinearCombinationClamp epilogue ("identity" in the name).
// GemmShape<128, 128, 32>, GemmShape<64, 32, 32> and the `2` after the swizzle
// mirror "128x128x32_64x32x32_2" in the instance name.
// NOTE(review): which role each positional argument plays is defined by
// cutlass::conv::device::Convolution, outside this file -- confirm there
// before hand-editing; this file is generator output.
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 128, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of cutlass_convolution_wrapper for the Convolution
// alias above, so the wrapper for this kernel configuration is compiled in
// this translation unit. Pointer element types come from Convolution's
// nested types.
// NOTE(review): the wrapper's definition presumably comes from the .cuinl
// included at the top of this file -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator
// Type alias configuring the convolution kernel for this instance. Visible
// choices: int8_t with TensorNCxHWx<4> and TensorCxRSKx<4> layouts,
// cutlass::uint4b_t and int32_t with TensorNHWC layouts, OpClassSimt on Sm75,
// and the BiasAddLinearCombinationClamp epilogue ("identity" in the name).
// GemmShape<128, 32, 32>, GemmShape<64, 32, 32> and the `2` after the swizzle
// mirror "128x32x32_64x32x32_2" in the instance name.
// NOTE(review): which role each positional argument plays is defined by
// cutlass::conv::device::Convolution, outside this file -- confirm there
// before hand-editing; this file is generator output.
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of cutlass_convolution_wrapper for the Convolution
// alias above, so the wrapper for this kernel configuration is compiled in
// this translation unit. Pointer element types come from Convolution's
// nested types.
// NOTE(review): the wrapper's definition presumably comes from the .cuinl
// included at the top of this file -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator
// Type alias configuring the convolution kernel for this instance. Visible
// choices: int8_t with TensorNCxHWx<4> and TensorCxRSKx<4> layouts,
// cutlass::uint4b_t and int32_t with TensorNHWC layouts, OpClassSimt on Sm75,
// and the BiasAddLinearCombinationClamp epilogue ("identity" in the name).
// GemmShape<128, 64, 32>, GemmShape<64, 32, 32> and the `2` after the swizzle
// mirror "128x64x32_64x32x32_2" in the instance name.
// NOTE(review): which role each positional argument plays is defined by
// cutlass::conv::device::Convolution, outside this file -- confirm there
// before hand-editing; this file is generator output.
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of cutlass_convolution_wrapper for the Convolution
// alias above, so the wrapper for this kernel configuration is compiled in
// this translation unit. Pointer element types come from Convolution's
// nested types.
// NOTE(review): the wrapper's definition presumably comes from the .cuinl
// included at the top of this file -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc" generated by cutlass generator
// Type alias configuring the convolution kernel for this instance. Visible
// choices: int8_t with TensorNCxHWx<4> and TensorCxRSKx<4> layouts,
// cutlass::uint4b_t and int32_t with TensorNHWC layouts, OpClassSimt on Sm75,
// and the BiasAddLinearCombinationClamp epilogue ("identity" in the name).
// GemmShape<16, 128, 16> (given twice) and the `1` after the swizzle mirror
// "16x128x16_16x128x16_1" in the instance name.
// NOTE(review): which role each positional argument plays is defined by
// cutlass::conv::device::Convolution, outside this file -- confirm there
// before hand-editing; this file is generator output.
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<16, 128, 16>,
cutlass::gemm::GemmShape<16, 128, 16>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
1,
4,
8,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of cutlass_convolution_wrapper for the Convolution
// alias above, so the wrapper for this kernel configuration is compiled in
// this translation unit. Pointer element types come from Convolution's
// nested types.
// NOTE(review): the wrapper's definition presumably comes from the .cuinl
// included at the top of this file -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc" generated by cutlass generator
// Type alias configuring the convolution kernel for this instance. Visible
// choices: int8_t with TensorNCxHWx<4> and TensorCxRSKx<4> layouts,
// cutlass::uint4b_t and int32_t with TensorNHWC layouts, OpClassSimt on Sm75,
// and the BiasAddLinearCombinationClamp epilogue ("identity" in the name).
// GemmShape<16, 64, 8> (given twice) and the `2` after the swizzle mirror
// "16x64x8_16x64x8_2" in the instance name.
// NOTE(review): which role each positional argument plays is defined by
// cutlass::conv::device::Convolution, outside this file -- confirm there
// before hand-editing; this file is generator output.
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<16, 64, 8>,
cutlass::gemm::GemmShape<16, 64, 8>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
4,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of cutlass_convolution_wrapper for the Convolution
// alias above, so the wrapper for this kernel configuration is compiled in
// this translation unit. Pointer element types come from Convolution's
// nested types.
// NOTE(review): the wrapper's definition presumably comes from the .cuinl
// included at the top of this file -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc" generated by cutlass generator
// Type alias configuring the convolution kernel for this instance. Visible
// choices: int8_t with TensorNCxHWx<4> and TensorCxRSKx<4> layouts,
// cutlass::uint4b_t and int32_t with TensorNHWC layouts, OpClassSimt on Sm75,
// and the BiasAddLinearCombinationClamp epilogue ("identity" in the name).
// GemmShape<32, 128, 32>, GemmShape<32, 64, 32> and the `2` after the swizzle
// mirror "32x128x32_32x64x32_2" in the instance name.
// NOTE(review): which role each positional argument plays is defined by
// cutlass::conv::device::Convolution, outside this file -- confirm there
// before hand-editing; this file is generator output.
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<32, 128, 32>,
cutlass::gemm::GemmShape<32, 64, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of cutlass_convolution_wrapper for the Convolution
// alias above, so the wrapper for this kernel configuration is compiled in
// this translation unit. Pointer element types come from Convolution's
// nested types.
// NOTE(review): the wrapper's definition presumably comes from the .cuinl
// included at the top of this file -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc" generated by cutlass generator
// Type alias configuring the convolution kernel for this instance. Visible
// choices: int8_t with TensorNCxHWx<4> and TensorCxRSKx<4> layouts,
// cutlass::uint4b_t and int32_t with TensorNHWC layouts, OpClassSimt on Sm75,
// and the BiasAddLinearCombinationClamp epilogue ("identity" in the name).
// GemmShape<32, 32, 32> (given twice) and the `2` after the swizzle mirror
// "32x32x32_32x32x32_2" in the instance name.
// NOTE(review): which role each positional argument plays is defined by
// cutlass::conv::device::Convolution, outside this file -- confirm there
// before hand-editing; this file is generator output.
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<32, 32, 32>,
cutlass::gemm::GemmShape<32, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of cutlass_convolution_wrapper for the Convolution
// alias above, so the wrapper for this kernel configuration is compiled in
// this translation unit. Pointer element types come from Convolution's
// nested types.
// NOTE(review): the wrapper's definition presumably comes from the .cuinl
// included at the top of this file -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc" generated by cutlass generator
// Type alias configuring the convolution kernel for this instance. Visible
// choices: int8_t with TensorNCxHWx<4> and TensorCxRSKx<4> layouts,
// cutlass::uint4b_t and int32_t with TensorNHWC layouts, OpClassSimt on Sm75,
// and the BiasAddLinearCombinationClamp epilogue ("identity" in the name).
// GemmShape<32, 64, 32> (given twice) and the `2` after the swizzle mirror
// "32x64x32_32x64x32_2" in the instance name.
// NOTE(review): which role each positional argument plays is defined by
// cutlass::conv::device::Convolution, outside this file -- confirm there
// before hand-editing; this file is generator output.
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<32, 64, 32>,
cutlass::gemm::GemmShape<32, 64, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of cutlass_convolution_wrapper for the Convolution
// alias above, so the wrapper for this kernel configuration is compiled in
// this translation unit. Pointer element types come from Convolution's
// nested types.
// NOTE(review): the wrapper's definition presumably comes from the .cuinl
// included at the top of this file -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator
// Type alias configuring the convolution kernel for this instance. Visible
// choices: int8_t with TensorNCxHWx<4> and TensorCxRSKx<4> layouts,
// cutlass::uint4b_t and int32_t with TensorNHWC layouts, OpClassSimt on Sm75,
// and the BiasAddLinearCombinationClamp epilogue ("identity" in the name).
// GemmShape<64, 128, 32>, GemmShape<64, 32, 32> and the `2` after the swizzle
// mirror "64x128x32_64x32x32_2" in the instance name.
// NOTE(review): which role each positional argument plays is defined by
// cutlass::conv::device::Convolution, outside this file -- confirm there
// before hand-editing; this file is generator output.
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<64, 128, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of cutlass_convolution_wrapper for the Convolution
// alias above, so the wrapper for this kernel configuration is compiled in
// this translation unit. Pointer element types come from Convolution's
// nested types.
// NOTE(review): the wrapper's definition presumably comes from the .cuinl
// included at the top of this file -- confirm.
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream,
typename Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of the wrapper template for the Convolution kernel
// type defined above in this file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const Convolution::ElementSrc* d_src,
        const Convolution::ElementFilter* d_filter,
        const Convolution::ElementBias* d_bias,
        const Convolution::ElementDst* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        const Convolution::ConvolutionParameter& conv_param,
        const Convolution::EpilogueOutputOp::Params& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<64, 64, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of the wrapper template for the Convolution kernel
// type defined above in this file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const Convolution::ElementSrc* d_src,
        const Convolution::ElementFilter* d_filter,
        const Convolution::ElementBias* d_bias,
        const Convolution::ElementDst* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        const Convolution::ConvolutionParameter& conv_param,
        const Convolution::EpilogueOutputOp::Params& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 128, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of the wrapper template for the Convolution kernel
// type defined above in this file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const Convolution::ElementSrc* d_src,
        const Convolution::ElementFilter* d_filter,
        const Convolution::ElementBias* d_bias,
        const Convolution::ElementDst* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        const Convolution::ConvolutionParameter& conv_param,
        const Convolution::EpilogueOutputOp::Params& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 32, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of the wrapper template for the Convolution kernel
// type defined above in this file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const Convolution::ElementSrc* d_src,
        const Convolution::ElementFilter* d_filter,
        const Convolution::ElementBias* d_bias,
        const Convolution::ElementDst* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        const Convolution::ConvolutionParameter& conv_param,
        const Convolution::EpilogueOutputOp::Params& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<128, 64, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of the wrapper template for the Convolution kernel
// type defined above in this file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const Convolution::ElementSrc* d_src,
        const Convolution::ElementFilter* d_filter,
        const Convolution::ElementBias* d_bias,
        const Convolution::ElementDst* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        const Convolution::ConvolutionParameter& conv_param,
        const Convolution::EpilogueOutputOp::Params& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<16, 128, 16>,
cutlass::gemm::GemmShape<16, 128, 16>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
1,
4,
8,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of the wrapper template for the Convolution kernel
// type defined above in this file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const Convolution::ElementSrc* d_src,
        const Convolution::ElementFilter* d_filter,
        const Convolution::ElementBias* d_bias,
        const Convolution::ElementDst* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        const Convolution::ConvolutionParameter& conv_param,
        const Convolution::EpilogueOutputOp::Params& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<16, 64, 8>,
cutlass::gemm::GemmShape<16, 64, 8>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
4,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of the wrapper template for the Convolution kernel
// type defined above in this file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const Convolution::ElementSrc* d_src,
        const Convolution::ElementFilter* d_filter,
        const Convolution::ElementBias* d_bias,
        const Convolution::ElementDst* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        const Convolution::ConvolutionParameter& conv_param,
        const Convolution::EpilogueOutputOp::Params& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<32, 128, 32>,
cutlass::gemm::GemmShape<32, 64, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of the wrapper template for the Convolution kernel
// type defined above in this file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const Convolution::ElementSrc* d_src,
        const Convolution::ElementFilter* d_filter,
        const Convolution::ElementBias* d_bias,
        const Convolution::ElementDst* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        const Convolution::ConvolutionParameter& conv_param,
        const Convolution::EpilogueOutputOp::Params& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<32, 32, 32>,
cutlass::gemm::GemmShape<32, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of the wrapper template for the Convolution kernel
// type defined above in this file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const Convolution::ElementSrc* d_src,
        const Convolution::ElementFilter* d_filter,
        const Convolution::ElementBias* d_bias,
        const Convolution::ElementDst* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        const Convolution::ConvolutionParameter& conv_param,
        const Convolution::EpilogueOutputOp::Params& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<32, 64, 32>,
cutlass::gemm::GemmShape<32, 64, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of the wrapper template for the Convolution kernel
// type defined above in this file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const Convolution::ElementSrc* d_src,
        const Convolution::ElementFilter* d_filter,
        const Convolution::ElementBias* d_bias,
        const Convolution::ElementDst* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        const Convolution::ConvolutionParameter& conv_param,
        const Convolution::EpilogueOutputOp::Params& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<64, 128, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of the wrapper template for the Convolution kernel
// type defined above in this file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const Convolution::ElementSrc* d_src,
        const Convolution::ElementFilter* d_filter,
        const Convolution::ElementBias* d_bias,
        const Convolution::ElementDst* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        const Convolution::ConvolutionParameter& conv_param,
        const Convolution::EpilogueOutputOp::Params& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of the wrapper template for the Convolution kernel
// type defined above in this file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const Convolution::ElementSrc* d_src,
        const Convolution::ElementFilter* d_filter,
        const Convolution::ElementBias* d_bias,
        const Convolution::ElementDst* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        const Convolution::ConvolutionParameter& conv_param,
        const Convolution::EpilogueOutputOp::Params& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 55
- 0
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu View File

@@ -0,0 +1,55 @@

#if !MEGDNN_TEGRA_X1
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl"


// kernel instance "cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator
using Convolution =
typename cutlass::conv::device::Convolution<
int8_t,
cutlass::layout::TensorNCxHWx<4>,
int8_t,
cutlass::layout::TensorCxRSKx<4>,
cutlass::uint4b_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::layout::TensorNHWC,
int32_t,
cutlass::conv::ConvType::kConvolution,
cutlass::arch::OpClassSimt,
cutlass::arch::Sm75,
cutlass::gemm::GemmShape<64, 64, 32>,
cutlass::gemm::GemmShape<64, 32, 32>,
cutlass::gemm::GemmShape<1, 1, 4>,
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
cutlass::uint4b_t,
8,
int32_t,
int32_t,
float
>,
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle,
2,
4,
16,
true,
cutlass::arch::OpMultiplyAddSaturate>;


// Explicit instantiation of the wrapper template for the Convolution kernel
// type defined above in this file.
template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
        const Convolution::ElementSrc* d_src,
        const Convolution::ElementFilter* d_filter,
        const Convolution::ElementBias* d_bias,
        const Convolution::ElementDst* d_z,
        Convolution::ElementDst* d_dst,
        int* workspace,
        const Convolution::ConvolutionParameter& conv_param,
        const Convolution::EpilogueOutputOp::Params& epilogue,
        cudaStream_t stream,
        Convolution::ExtraParam extra_param);
#pragma GCC diagnostic pop
#endif

+ 0
- 1
src/gopt/impl/tensor_reformat.cpp View File

@@ -3801,7 +3801,6 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const {
return false;
auto in_dtype = typecvt->input(0)->dtype(),
out_dtype = typecvt->output(0)->dtype();
printf("%s, %s\n", in_dtype.name(), out_dtype.name());
bool is_s82s4 = in_dtype.enumv() == DTypeEnum::QuantizedS8 &&
(out_dtype.enumv() == DTypeEnum::QuantizedS4 ||
out_dtype.enumv() == DTypeEnum::Quantized4Asymm);


+ 4
- 33
src/gopt/test/inference.cpp View File

@@ -4159,14 +4159,7 @@ TEST(TestGoptInference, FoldingConvDimshuffle) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver < 61) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 61);
return;
}
REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);

HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
@@ -4240,14 +4233,7 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NCHW32) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver < 61) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 61);
return;
}
REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1);

HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
@@ -4326,14 +4312,7 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW32NCHW4) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver < 75) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 75);
return;
}
REQUIRE_CUDA_COMPUTE_CAPABILITY(7, 5);

HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
@@ -4405,14 +4384,7 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NHWC) {
REQUIRE_GPU(1);
auto cn = CompNode::load("gpu0");
cn.activate();
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop;
auto sm_ver = prop.major * 10 + prop.minor;
if (sm_ver < 75) {
printf("This testcast ignored due to insufficient cuda cap(got: %d, "
"expected: %d)\n",
sm_ver, 75);
return;
}
REQUIRE_CUDA_COMPUTE_CAPABILITY(7, 5);

HostTensorGenerator<dtype::Int8> gen;
auto graph = ComputingGraph::make();
@@ -4466,7 +4438,6 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NHWC) {
->writeto_fpath(output_file(
"TestGoptInference.FoldingConvDimshuffleNCHW4NHWC.json"));
size_t nr_dimshuffle = find_opr_num<opr::TypeCvt>(y_fuse);
printf("%zu \n", nr_dimshuffle);
ASSERT_EQ(3u, find_opr_num<opr::Dimshuffle>(y_fuse));
bool found = false;
cg::DepOprIter{[&found](cg::OperatorNodeBase* opr) {


Loading…
Cancel
Save