From b4687ce8da766540455d0923788ad3e9fbc8f8e4 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Mon, 24 May 2021 19:03:53 +0800 Subject: [PATCH] feat(dnn/cuda): add convolution with i8 input and u4 output GitOrigin-RevId: 8be439abf1f448a6be33ea0e57c48e674c5c94c4 --- .../cuda/conv_bias/cutlass_convolution_wrapper.cu | 4 +- .../implicit_gemm_conv_bias_cutlass_wrapper.cuinl | 2 +- .../conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp | 29 ++++++++++-- ..._hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu | 3 +- ...p_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu | 3 +- ...p_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu | 3 +- ..._hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu | 3 +- ...prop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu | 3 +- ...p_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu | 3 +- ...op_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu | 3 +- ...op_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu | 3 +- ...p_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu | 3 +- ...op_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu | 3 +- ...op_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu | 3 +- ...dentity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu | 3 +- ...identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu | 3 +- ...identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu | 3 +- ...dentity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu | 3 +- ...op_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu | 3 +- ...identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu | 3 +- ..._identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu | 3 +- ..._identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu | 3 +- ...identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu | 3 +- ..._identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu | 3 +- ..._identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu | 3 +- ...op_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu | 3 +- ...rop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu | 3 +- ...rop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu | 3 +- ...op_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu | 3 +- ...ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu | 3 +- ...rop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu | 3 +- ...prop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu | 3 +- ...prop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu | 3 +- ...rop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu | 3 +- ...prop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu | 3 +- ...prop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu | 3 +- ..._hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...p_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...p_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ..._hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...prop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...p_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...op_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...op_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...p_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...op_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...op_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...dentity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...dentity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ 
...op_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ..._identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ..._identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ..._identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ..._identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...op_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...rop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...rop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...op_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...rop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...prop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...prop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...rop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...prop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ ...prop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu | 55 ++++++++++++++++++++++ src/gopt/impl/tensor_reformat.cpp | 1 - src/gopt/test/inference.cpp | 37 ++------------- 71 files changed, 1912 insertions(+), 75 deletions(-) create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu create mode 100644 
dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu create mode 100644 dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu index 4e3cba3e..7ad77f3e 100644 --- a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu +++ b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu @@ -960,7 +960,7 @@ void megdnn::cuda::cutlass_wrapper:: ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ cutlass::conv::threadblock:: \ ConvolutionFpropNCxHWxThreadblockSwizzle, \ - stages_, 4, aligned_, NeedLoadFromConstMem, \ + stages_, 4, aligned_, true, \ cutlass::arch::OpMultiplyAddSaturate>; \ typename Convolution::ConvolutionParameter conv_param( \ param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ @@ -1020,7 +1020,7 @@ void megdnn::cuda::cutlass_wrapper:: ElementOutput, 8, ElementAccumulator, ElementBias, ElementCompute>; typename EpilogueOp::Params epilogue{alpha, beta, gamma, - scale, detla, theta}; + scale, delta, theta}; DISPATCH_KERNEL; } default: diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl b/dnn/src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl index 9f09ce41..6d1582bd 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl +++ b/dnn/src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl @@ -1,6 +1,6 @@ /** * \file - * dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl + * dnn/src/cuda/conv_bias/int8/implicit_gemm_conv_bias_cutlass_wrapper.cuinl * MegEngine is Licensed 
under the Apache License, Version 2.0 (the "License")
  *
  * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp
index 87672047..3287b80a 100644
--- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp
+++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp
@@ -181,6 +181,12 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
     float alpha = src_scale * filter_scale;
     float beta = 1.f;
     float dst_scale = 1.f;
+    float gamma = 0.f;
+    float theta = 0.f;
+    if (args.dst_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
+        theta = args.dst_layout->dtype.param<dtype::Quantized4Asymm>()
+                        .zero_point;
+    }
     if (args.bias_layout->dtype.enumv() == DTypeEnum::QuantizedS32) {
         megdnn_assert(args.dst_layout->dtype.category() ==
                       DTypeCategory::QUANTIZED);
@@ -189,7 +195,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
         dst_scale = get_scale(args.dst_layout->dtype);
         alpha /= dst_scale, beta = bias_scale / dst_scale;
     }
-    float gamma = 0.f;
+    float delta = 0.f;
     if (args.z_layout->ndim > 0) {
         gamma = 1.f;
         if (args.z_layout->dtype.category() == DTypeCategory::QUANTIZED) {
@@ -198,6 +204,12 @@
             float z_scale = get_scale(args.z_layout->dtype);
             gamma = z_scale / dst_scale;
         }
+        if (args.z_layout->dtype.enumv() == DTypeEnum::Quantized4Asymm) {
+            uint8_t z_zero =
+                    args.z_layout->dtype.param<dtype::Quantized4Asymm>()
+                            .zero_point;
+            delta = -z_zero * gamma;
+        }
     }
     uint32_t nonlinear_mode = static_cast<uint32_t>(param.nonlineMode);
     bool nonunity_kernel = !(fh == 1 && fw == 1);
@@ -244,14 +256,15 @@
         DISPATCH(false);
 #undef cb
     } else if (param.format == Format::NCHW4_NHWC) {
-#define cb(_nonunity_kernel) \
+#define cb(_signedness) \
     cutlass_wrapper::do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nhwc< \
-            _nonunity_kernel>( \
+            _signedness>( \
             args.src_tensor->compatible_ptr(), filter_ptr, \
             args.bias_tensor->compatible_ptr(), \
             reinterpret_cast(args.z_tensor->raw_ptr), \
             reinterpret_cast(args.dst_tensor->raw_ptr), nullptr, \
-            kern_param, nonlinear_mode, alpha, beta, gamma, dst_scale, \
+            kern_param, nonlinear_mode, alpha, beta, gamma, delta, theta, \
+            dst_scale, \
             cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m, \
                                        m_algo_param.threadblock_n, \
                                        m_algo_param.threadblock_k}, \
@@ -259,7 +272,13 @@
                                        m_algo_param.warp_n, \
                                        m_algo_param.warp_k}, \
             m_algo_param.stage, stream);
-        cb(true);
+        if (args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS4) {
+            cb(true);
+        } else {
+            megdnn_assert(args.dst_layout->dtype.enumv() ==
+                          DTypeEnum::Quantized4Asymm);
+            cb(false);
+        }
 #undef cb
     } else {
         megdnn_assert(param.format == Format::NCHW4_NCHW32);
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu
index 42e715a0..4af7d5a5 100644
--- a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_s4_ifprop_hswish_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu
@@ -49,6 +49,7 @@ template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, +
int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..9b45e053 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_128x32x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..da56c4af --- /dev/null +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_128x64x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu new file mode 100644 index 00000000..fb31c9d5 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_16x128x16_16x128x16_1_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 1, + 4, + 8, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, 
+ const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..2f861533 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_16x64x8_16x64x8_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 4, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..ed4ff282 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_32x128x32_32x64x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + 
int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<32, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..5a3898bd --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_32x32x32_32x32x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..0d0c034a --- /dev/null +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_32x64x32_32x64x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..29a4bc98 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_64x128x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const 
typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..358017ff --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_64x32x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..f913d6a0 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_hswish_s8_64x64x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + 
int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..65bf4569 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_identity_s8_128x128x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..35a155c0 --- /dev/null +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_identity_s8_128x32x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..b778d11f --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_identity_s8_128x64x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + 
const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu new file mode 100644 index 00000000..d25af069 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_identity_s8_16x128x16_16x128x16_1_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 1, + 4, + 8, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..5edf953d --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_identity_s8_16x64x8_16x64x8_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + 
cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 4, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..c52e7751 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_identity_s8_32x128x32_32x64x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<32, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..153e6713 --- /dev/null +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_identity_s8_32x32x32_32x32x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..5e3ec931 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_identity_s8_32x64x32_32x64x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const 
typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..fbbc7b8e --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_identity_s8_64x128x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..efbd0b39 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_identity_s8_64x32x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + 
cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..6bd54281 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_identity_s8_64x64x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..b417533b --- /dev/null +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_relu_s8_128x128x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 128, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..23bfd1d2 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_relu_s8_128x32x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename 
Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..a0ea06e1 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_relu_s8_128x64x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<128, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu new file mode 100644 index 00000000..bfaab36c --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_relu_s8_16x128x16_16x128x16_1_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + 
cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<16, 128, 16>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 1, + 4, + 8, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..072804b6 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_relu_s8_16x64x8_16x64x8_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<16, 64, 8>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 4, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..b5c634d6 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if 
!MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_relu_s8_32x128x32_32x64x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<32, 128, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..9ccac5c9 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_relu_s8_32x32x32_32x32x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<32, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* 
workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..a60595a3 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_relu_s8_32x64x32_32x64x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<32, 64, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..6cf4781c --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_relu_s8_64x128x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 128, 32>, + 
cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..affc60c8 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_relu_s8_64x32x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu new file mode 100644 index 00000000..7cb844de --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc.cu @@ -0,0 +1,55 @@ + +#if !MEGDNN_TEGRA_X1 +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" 
+#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/implicit_gemm_conv_bias_cutlass_wrapper.cuinl" + + +// kernel instance "cutlass_simt_u4_ifprop_relu_s8_64x64x32_64x32x32_2_nc4hw4_nhwc" generated by cutlass generator +using Convolution = + typename cutlass::conv::device::Convolution< + int8_t, + cutlass::layout::TensorNCxHWx<4>, + int8_t, + cutlass::layout::TensorCxRSKx<4>, + cutlass::uint4b_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::layout::TensorNHWC, + int32_t, + cutlass::conv::ConvType::kConvolution, + cutlass::arch::OpClassSimt, + cutlass::arch::Sm75, + cutlass::gemm::GemmShape<64, 64, 32>, + cutlass::gemm::GemmShape<64, 32, 32>, + cutlass::gemm::GemmShape<1, 1, 4>, + cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + cutlass::uint4b_t, + 8, + int32_t, + int32_t, + float + >, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, + 2, + 4, + 16, + true, + cutlass::arch::OpMultiplyAddSaturate>; + + +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream, + typename Convolution::ExtraParam extra_param); +#pragma GCC diagnostic pop +#endif diff --git a/src/gopt/impl/tensor_reformat.cpp b/src/gopt/impl/tensor_reformat.cpp index 8789444f..6cce7359 100644 --- a/src/gopt/impl/tensor_reformat.cpp +++ b/src/gopt/impl/tensor_reformat.cpp @@ -3801,7 +3801,6 @@ void FoldingConvBiasDimshufflePass::apply(OptState& opt) const { return false; auto in_dtype = typecvt->input(0)->dtype(), out_dtype = typecvt->output(0)->dtype(); - printf("%s, %s\n", in_dtype.name(), out_dtype.name()); bool is_s82s4 = in_dtype.enumv() == DTypeEnum::QuantizedS8 && (out_dtype.enumv() == DTypeEnum::QuantizedS4 || out_dtype.enumv() == DTypeEnum::Quantized4Asymm); diff --git a/src/gopt/test/inference.cpp b/src/gopt/test/inference.cpp index b8598a45..633ee347 100644 --- a/src/gopt/test/inference.cpp +++ b/src/gopt/test/inference.cpp @@ -4159,14 +4159,7 @@ TEST(TestGoptInference, FoldingConvDimshuffle) { REQUIRE_GPU(1); auto cn = CompNode::load("gpu0"); cn.activate(); - auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; - auto sm_ver = prop.major * 10 + prop.minor; - if (sm_ver < 61) { - printf("This testcast ignored due to insufficient cuda cap(got: %d, " - "expected: %d)\n", - sm_ver, 61); - return; - } + REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1); HostTensorGenerator gen; auto graph = ComputingGraph::make(); @@ -4240,14 +4233,7 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NCHW32) { REQUIRE_GPU(1); auto cn = CompNode::load("gpu0"); cn.activate(); - auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; - auto sm_ver = prop.major * 10 + prop.minor; - if (sm_ver < 61) { - printf("This testcast ignored due to insufficient cuda cap(got: %d, " - "expected: %d)\n", - sm_ver, 61); - return; - } + REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1); HostTensorGenerator gen; auto graph = ComputingGraph::make(); @@ -4326,14 +4312,7 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW32NCHW4) { REQUIRE_GPU(1); auto cn = CompNode::load("gpu0"); cn.activate(); - auto&& prop = 
CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; - auto sm_ver = prop.major * 10 + prop.minor; - if (sm_ver < 75) { - printf("This testcast ignored due to insufficient cuda cap(got: %d, " - "expected: %d)\n", - sm_ver, 75); - return; - } + REQUIRE_CUDA_COMPUTE_CAPABILITY(7, 5); HostTensorGenerator gen; auto graph = ComputingGraph::make(); @@ -4405,14 +4384,7 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NHWC) { REQUIRE_GPU(1); auto cn = CompNode::load("gpu0"); cn.activate(); - auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; - auto sm_ver = prop.major * 10 + prop.minor; - if (sm_ver < 75) { - printf("This testcast ignored due to insufficient cuda cap(got: %d, " - "expected: %d)\n", - sm_ver, 75); - return; - } + REQUIRE_CUDA_COMPUTE_CAPABILITY(7, 5); HostTensorGenerator gen; auto graph = ComputingGraph::make(); @@ -4466,7 +4438,6 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NHWC) { ->writeto_fpath(output_file( "TestGoptInference.FoldingConvDimshuffleNCHW4NHWC.json")); size_t nr_dimshuffle = find_opr_num(y_fuse); - printf("%zu \n", nr_dimshuffle); ASSERT_EQ(3u, find_opr_num(y_fuse)); bool found = false; cg::DepOprIter{[&found](cg::OperatorNodeBase* opr) {