diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu index 46a65af5..667b5771 100644 --- a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu +++ b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu @@ -286,7 +286,8 @@ void megdnn::cuda::cutlass_wrapper:: uint32_t /* nonlinear_mode */, float /* alpha */, float /* beta */, float /* gamma */, float /* scale */, const GemmCoord& /* threadblock_shape */, - const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {} + const GemmCoord& /* warp_shape */, int /* stages */, + cudaStream_t /* stream */) {} #else template void megdnn::cuda::cutlass_wrapper:: @@ -296,15 +297,15 @@ void megdnn::cuda::cutlass_wrapper:: int* workspace, const convolution::ConvParam& param, uint32_t nonlinear_mode, float alpha, float beta, float gamma, float scale, const GemmCoord& threadblock_shape, - const GemmCoord& warp_shape, cudaStream_t stream) { + const GemmCoord& warp_shape, int stages, cudaStream_t stream) { #define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \ threadblock_k_, warp_m_, warp_n_, \ - warp_k_, stage_, aligned_) \ + warp_k_, stage_, aligned_) \ if (threadblock_shape.m() == threadblock_m_ && \ threadblock_shape.n() == threadblock_n_ && \ threadblock_shape.k() == threadblock_k_ && \ warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \ - warp_shape.k() == warp_k_) { \ + warp_shape.k() == warp_k_ && stages == stage_) { \ using ThreadBlockShape = \ cutlass::gemm::GemmShape; \ @@ -397,7 +398,8 @@ void megdnn::cuda::cutlass_wrapper:: uint32_t nonlinear_mode, float alpha, float beta, \ float gamma, float scale, \ const GemmCoord& threadblock_shape, \ - const GemmCoord& warp_shape, cudaStream_t stream); + const GemmCoord& warp_shape, int stages, \ + cudaStream_t stream); INST(true); INST(false); #undef INST @@ -414,7 +416,8 @@ void megdnn::cuda::cutlass_wrapper:: uint32_t /* nonlinear_mode */, float /* alpha */, float /* beta */, float /* gamma */, float /* scale */, const GemmCoord& /* threadblock_shape */, - const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {} + const GemmCoord& /* warp_shape */, int /* stages */, + cudaStream_t /* stream */) {} #else template void megdnn::cuda::cutlass_wrapper:: @@ -424,15 +427,15 @@ void megdnn::cuda::cutlass_wrapper:: int* workspace, const convolution::ConvParam& param, uint32_t nonlinear_mode, float alpha, float beta, float gamma, float scale, const GemmCoord& threadblock_shape, - const GemmCoord& warp_shape, cudaStream_t stream) { + const GemmCoord& warp_shape, int stages, cudaStream_t stream) { #define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \ threadblock_k_, warp_m_, warp_n_, \ - warp_k_, aligned_) \ + warp_k_, stages_, aligned_) \ if (threadblock_shape.m() == threadblock_m_ && \ threadblock_shape.n() == threadblock_n_ && \ threadblock_shape.k() == threadblock_k_ && \ warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \ - warp_shape.k() == warp_k_) { \ + warp_shape.k() == warp_k_ && stages == stages_) { \ using ThreadBlockShape = \ cutlass::gemm::GemmShape; \ @@ -449,7 +452,7 @@ void megdnn::cuda::cutlass_wrapper:: cutlass::convolution::threadblock:: \ ConvolutionNCxHWxThreadblockSwizzle< \ cutlass::convolution::ConvType::kConvolution>, \ - 2, 4, aligned_, NeedLoadFromConstMem, \ + stages_, 4, aligned_, NeedLoadFromConstMem, \ cutlass::arch::OpMultiplyAdd>; \ typename Convolution::ConvolutionParameter conv_param{ \ param.n, param.ci, param.co, param.hi, param.wi, \ @@ 
-460,16 +463,17 @@ void megdnn::cuda::cutlass_wrapper:: epilogue, stream); \ } #define DISPATCH_KERNEL \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 64, 8, 16, 64, 8, 4); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 128, 16, 16, 128, 16, 1, 8); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 64, 8, 16, 64, 8, 2, 4); \ megdnn_assert(false, \ "unsupported threadblock shape (%dx%dx%d) and warp shape " \ "(%dx%dx%d)", \ @@ -525,7 +529,8 @@ void megdnn::cuda::cutlass_wrapper:: uint32_t nonlinear_mode, float alpha, float beta, \ float gamma, float scale, \ const GemmCoord& threadblock_shape, \ - const GemmCoord& warp_shape, cudaStream_t stream); + const GemmCoord& warp_shape, int stages, \ + cudaStream_t stream); INST(true); INST(false); #undef INST @@ -542,7 +547,8 @@ void megdnn::cuda::cutlass_wrapper:: uint32_t /* nonlinear_mode */, float /* alpha */, float /* beta */, float /* gamma */, float /* scale */, const GemmCoord& /* threadblock_shape */, - const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {} + const GemmCoord& /* warp_shape */, int /* stages */, + cudaStream_t /* stream */) {} #else template void megdnn::cuda::cutlass_wrapper:: @@ -552,15 +558,15 @@ void megdnn::cuda::cutlass_wrapper:: int* workspace, const convolution::ConvParam& param, uint32_t nonlinear_mode, float alpha, float beta, float gamma, float scale, const GemmCoord& threadblock_shape, - const GemmCoord& warp_shape, cudaStream_t stream) { + const GemmCoord& warp_shape, int stages, cudaStream_t stream) { #define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \ threadblock_k_, warp_m_, warp_n_, \ - warp_k_, aligned_) \ + warp_k_, stages_, aligned_) \ if (threadblock_shape.m() == threadblock_m_ && \ threadblock_shape.n() == threadblock_n_ && \ threadblock_shape.k() == threadblock_k_ && \ warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \ - warp_shape.k() == warp_k_) { \ + warp_shape.k() == warp_k_ && stages == stages_) { \ using ThreadBlockShape = \ cutlass::gemm::GemmShape; \ @@ -577,7 +583,7 @@ void megdnn::cuda::cutlass_wrapper:: cutlass::convolution::threadblock:: \ ConvolutionNCxHWxThreadblockSwizzle< \ cutlass::convolution::ConvType::kConvolution>, \ - 2, 4, aligned_, NeedLoadFromConstMem>; \ + stages_, 4, aligned_, NeedLoadFromConstMem>; \ typename Convolution::ConvolutionParameter conv_param{ \ param.n, 
param.ci, param.co, param.hi, param.wi, \ param.fh, param.fw, param.ho, param.wo, param.sh, \ @@ -587,15 +593,15 @@ void megdnn::cuda::cutlass_wrapper:: epilogue, stream); \ } #define DISPATCH_KERNEL \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \ megdnn_assert(false, \ "unsupported threadblock shape (%dx%dx%d) and warp shape " \ "(%dx%dx%d)", \ @@ -651,7 +657,8 @@ void megdnn::cuda::cutlass_wrapper:: uint32_t nonlinear_mode, float alpha, float beta, \ float gamma, float scale, \ const GemmCoord& threadblock_shape, \ - const GemmCoord& warp_shape, cudaStream_t stream); + const GemmCoord& warp_shape, int stages, \ + cudaStream_t stream); INST(true); INST(false); #undef INST diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh index 71c15856..0a9511d2 100644 --- a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh +++ b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh @@ -56,7 +56,7 @@ void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4( const convolution::ConvParam& param, uint32_t nonlinear_mode, float alpha, float beta, float gamma, float scale, const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, - cudaStream_t stream); + int stages, cudaStream_t stream); template void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw( @@ -65,7 +65,7 @@ void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw( const convolution::ConvParam& param, uint32_t nonlinear_mode, float alpha, float beta, float gamma, float scale, const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, - cudaStream_t stream); + int stages, cudaStream_t stream); template void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32( @@ -74,7 +74,7 @@ void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32( const convolution::ConvParam& param, uint32_t nonlinear_mode, float alpha, float beta, float gamma, float scale, const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, - cudaStream_t stream); + int stages, cudaStream_t stream); } // namespace cutlass_wrapper } // namespace cuda diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp index 76cec7d2..ace4620a 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp @@ 
-32,8 +32,11 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available( if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout), param.format)) return false; - if (param.format != Format::NCHW4 && param.format != Format::NCHW4_NCHW && - param.format != Format::NCHW4_NCHW32) + if (param.format == Format::NCHW4_NCHW32) { + if (m_algo_param.threadblock_m % 32 != 0) + return false; + } else if (param.format != Format::NCHW4_NCHW && + param.format != Format::NCHW4) return false; size_t n = args.src_layout->operator[](0), ci = args.src_layout->operator[](1) * 4, @@ -187,7 +190,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( cutlass_wrapper::GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n, m_algo_param.warp_k}, - stream); + m_algo_param.stage, stream); } else if (param.format == Format::NCHW4_NCHW) { cutlass_wrapper:: do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw( @@ -205,7 +208,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( cutlass_wrapper::GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n, m_algo_param.warp_k}, - stream); + m_algo_param.stage, stream); } else { megdnn_assert(param.format == Format::NCHW4_NCHW32); cutlass_wrapper:: @@ -225,7 +228,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( cutlass_wrapper::GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n, m_algo_param.warp_k}, - stream); + m_algo_param.stage, stream); } } else { if (param.format == Format::NCHW4) { @@ -242,7 +245,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( cutlass_wrapper::GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n, m_algo_param.warp_k}, - stream); + m_algo_param.stage, stream); } else if (param.format == Format::NCHW4_NCHW) { cutlass_wrapper:: do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw( @@ -260,7 +263,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( cutlass_wrapper::GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n, m_algo_param.warp_k}, - stream); + m_algo_param.stage, stream); } else { megdnn_assert(param.format == Format::NCHW4_NCHW32); @@ -281,7 +284,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( cutlass_wrapper::GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n, m_algo_param.warp_k}, - stream); + m_algo_param.stage, stream); } } after_kernel_launch(); diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu index ab01f989..0c779fe1 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClam int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, 
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 1, 4, 8, true>; + 1, 4, 8, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu index 9f901437..85fdecd7 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 1, 4, 8, true>; + 1, 4, 8, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu index 5dfd371c..c088e6f6 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = 
cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 1, 4, 8, true>; + 1, 4, 8, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu index 1c7115e9..14226ecd 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClam int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 1, 4, 8, false>; + 1, 4, 8, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* 
d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu index 374f51e9..449dee42 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 1, 4, 8, false>; + 1, 4, 8, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu index 76ecaad0..0ed74669 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, 
WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 1, 4, 8, false>; + 1, 4, 8, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_hswish.cu new file mode 100644 index 00000000..8865080e --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCHW; +using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; +using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, 1, int32_t, float, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, float, + LayoutDst, float, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 1, 4, 8, true, + cutlass::arch::OpMultiplyAdd>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_id.cu new file mode 100644 index 00000000..6a93ab61 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore 
warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCHW; +using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; +using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< + float, 1, int32_t, float, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, float, + LayoutDst, float, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 1, 4, 8, true, + cutlass::arch::OpMultiplyAdd>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_relu.cu new file mode 100644 index 00000000..6e01aaaf --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCHW; +using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; +using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, 1, int32_t, float, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, float, + LayoutDst, float, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 1, 4, 8, true, + cutlass::arch::OpMultiplyAdd>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename 
Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_hswish.cu new file mode 100644 index 00000000..ddea6211 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCHW; +using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; +using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, 1, int32_t, float, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, float, + LayoutDst, float, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 1, 4, 8, false, + cutlass::arch::OpMultiplyAdd>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_id.cu new file mode 100644 index 00000000..0aafd7fe --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCHW; 
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; +using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< + float, 1, int32_t, float, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, float, + LayoutDst, float, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 1, 4, 8, false, + cutlass::arch::OpMultiplyAdd>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_relu.cu new file mode 100644 index 00000000..0809ddee --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCHW; +using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; +using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, 1, int32_t, float, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, float, + LayoutDst, float, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 1, 4, 8, false, + cutlass::arch::OpMultiplyAdd>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git 
a/src/gopt/test/inference.cpp b/src/gopt/test/inference.cpp index f6e0f540..c6e040b2 100644 --- a/src/gopt/test/inference.cpp +++ b/src/gopt/test/inference.cpp @@ -3895,6 +3895,9 @@ TEST(TestGoptInference, FoldingConvDimshuffle) { .apply({{y}}) .endpoint_vars(), y_fuse); + gopt::modify_opr_algo_strategy_inplace( + {y_fuse}, + opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::PROFILE); graph->compile({{y_fuse, {}}}) ->to_json() ->writeto_fpath(output_file( @@ -3976,6 +3979,9 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NCHW32) { .apply({{y}}) .endpoint_vars(), y_fuse); + gopt::modify_opr_algo_strategy_inplace( + {y_fuse}, + opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::PROFILE); graph->compile({{y_fuse, {}}}) ->to_json() ->writeto_fpath(output_file(
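
Note on the change, with an illustrative sketch. Readable directly from the diff: the dp4a conv_bias wrapper entry points now take an explicit `int stages` argument (threaded from `m_algo_param.stage` in `AlgoInt8NCHW4DotProdImplicitGemm::exec`); the `DISPATCH_KERNEL_WITH_TILE_SHAPE` macros additionally require `stages == stage_` / `stages == stages_` and forward that value into the CUTLASS `Convolution` template instead of the previously hard-coded `2`; a single-stage 16x128x16 threadblock/warp tile is registered together with new NCHW-output kimpl instantiations; and the NCHW4_NCHW32 output format is only reported as available when `threadblock_m` is a multiple of 32. The self-contained C++ sketch below illustrates the stage-aware dispatch pattern only. `GemmCoordLite`, `launch_candidate`, `dispatch_conv` and `TRY_CANDIDATE` are hypothetical names invented for this example and are not part of the MegDNN or CUTLASS APIs.

// Simplified, standalone illustration of stage-aware tile dispatch.
// Only the matching logic mirrors the diff; all names are hypothetical.
#include <cstdio>

struct GemmCoordLite {  // stand-in for cutlass::gemm::GemmCoord
    int m, n, k;
};

// Each candidate kernel bakes its tile shapes and pipeline stage count into
// the type, just as the real code instantiates
// cutlass::convolution::device::Convolution<..., stages_, ...>.
template <int TbM, int TbN, int TbK, int WpM, int WpN, int WpK, int Stages>
void launch_candidate() {
    std::printf("launching %dx%dx%d / %dx%dx%d, %d stage(s)\n",
                TbM, TbN, TbK, WpM, WpN, WpK, Stages);
}

// Runtime dispatch: threadblock shape, warp shape *and* stage count must all
// match a compiled candidate, mirroring the updated macro condition.
bool dispatch_conv(const GemmCoordLite& tb, const GemmCoordLite& wp,
                   int stages) {
#define TRY_CANDIDATE(TBM, TBN, TBK, WPM, WPN, WPK, STAGES)          \
    if (tb.m == TBM && tb.n == TBN && tb.k == TBK && wp.m == WPM &&  \
        wp.n == WPN && wp.k == WPK && stages == STAGES) {            \
        launch_candidate<TBM, TBN, TBK, WPM, WPN, WPK, STAGES>();    \
        return true;                                                 \
    }
    TRY_CANDIDATE(128, 128, 32, 64, 32, 32, 2);
    TRY_CANDIDATE(16, 128, 16, 16, 128, 16, 1);  // new single-stage tile
    TRY_CANDIDATE(16, 64, 8, 16, 64, 8, 2);
#undef TRY_CANDIDATE
    return false;  // unsupported combination: caller reports an error
}

int main() {
    // Request the new small tile in its single-stage configuration.
    return dispatch_conv({16, 128, 16}, {16, 128, 16}, /*stages=*/1) ? 0 : 1;
}

The reason `stages` must participate in dispatch rather than being passed at runtime is that the pipeline stage count is a compile-time template parameter of the CUTLASS convolution kernel; a non-matching request simply falls through the candidate list and ends at the existing `megdnn_assert(false, "unsupported threadblock shape ...")`.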