diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu index f9dd4c45..e9e056e2 100644 --- a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu +++ b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu @@ -25,6 +25,8 @@ using namespace megdnn; using namespace cuda; using namespace cutlass_wrapper; +/* ================= cutlass kernel wrapper for nchw32 layout ================ + */ #if MEGDNN_TEGRA_X1 template void megdnn::cuda::cutlass_wrapper:: @@ -148,6 +150,131 @@ INST(true); INST(false); #undef INST +/* ==== cutlass kernel wrapper for nchw32 layout and nchw4 output ===== */ +#if MEGDNN_TEGRA_X1 +template +void megdnn::cuda::cutlass_wrapper:: + do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4( + const int8_t* /* d_src */, const int8_t* /* d_filter */, + const int32_t* /* d_bias */, const int8_t* /* d_z */, + int8_t* /* d_dst */, int* /* workspace */, + const convolution::ConvParam& /* param */, + uint32_t /* nonlinear_mode */, float /* alpha */, + float /* beta */, float /* gamma */, float /* scale */, + const GemmCoord& /* threadblock_shape */, + const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {} +#else +template +void megdnn::cuda::cutlass_wrapper:: + do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4( + const int8_t* d_src, const int8_t* d_filter, + const int32_t* d_bias, const int8_t* d_z, int8_t* d_dst, + int* workspace, const convolution::ConvParam& param, + uint32_t nonlinear_mode, float alpha, float beta, float gamma, + float scale, const GemmCoord& threadblock_shape, + const GemmCoord& warp_shape, cudaStream_t stream) { +#define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \ + threadblock_k_, warp_m_, warp_n_, \ + warp_k_) \ + if (threadblock_shape.m() == threadblock_m_ && \ + threadblock_shape.n() == threadblock_n_ && \ + threadblock_shape.k() == threadblock_k_ && \ + warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \ + warp_shape.k() == warp_k_) { \ + using ThreadBlockShape = \ + cutlass::gemm::GemmShape; \ + using WarpShape = cutlass::gemm::GemmShape; \ + using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; \ + using Convolution = cutlass::convolution::device::Convolution< \ + int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t, \ + cutlass::layout::TensorCxRSKx<32>, ElementOutput, \ + cutlass::layout::TensorNCxHWx<4>, int32_t, \ + cutlass::layout::TensorNCxHWx<4>, int32_t, \ + cutlass::convolution::ConvType::kConvolution, \ + cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \ + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ + cutlass::convolution::threadblock:: \ + ConvolutionNCxHWxThreadblockSwizzle< \ + cutlass::convolution::ConvType::kConvolution>, \ + 2, 16, 16, NeedLoadFromConstMem>; \ + typename Convolution::ConvolutionParameter conv_param{ \ + param.n, param.ci, param.co, param.hi, param.wi, \ + param.fh, param.fw, param.ho, param.wo, param.sh, \ + param.sw, param.ph, param.pw, 1, 1}; \ + return cutlass_convolution_wrapper( \ + d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ + epilogue, stream); \ + } +#define DISPATCH_KERNEL \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(256, 128, 64, 64, 64, 64); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 256, 64, 64, 64, 64); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 64, 64, 64, 64); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 64, 32, 64, 64); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 64, 64, 32, 64); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 64, 32, 32, 64); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 64, 16, 32, 64); \ + megdnn_assert(false, \ + "unsupported threadblock shape (%dx%dx%d) and warp shape " \ + "(%dx%dx%d)", \ + threadblock_shape.m(), threadblock_shape.n(), \ + threadblock_shape.k(), warp_shape.m(), warp_shape.n(), \ + warp_shape.k()); + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementBias = int32_t; + using ElementCompute = float; + using NonlineMode = megdnn::param_enumv::ConvBias::NonlineMode; + switch (nonlinear_mode) { + case NonlineMode::IDENTITY: { + using EpilogueOp = + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + ElementOutput, 4, ElementAccumulator, ElementBias, + ElementCompute>; + typename EpilogueOp::Params epilogue{alpha, beta, gamma}; + DISPATCH_KERNEL; + } + case NonlineMode::RELU: { + using EpilogueOp = cutlass::epilogue::thread:: + BiasAddLinearCombinationReluClamp< + ElementOutput, 4, ElementAccumulator, ElementBias, + ElementCompute>; + typename EpilogueOp::Params epilogue{alpha, beta, gamma, 0}; + DISPATCH_KERNEL; + } + case NonlineMode::H_SWISH: { + using EpilogueOp = cutlass::epilogue::thread:: + BiasAddLinearCombinationHSwishClamp< + ElementOutput, 4, ElementAccumulator, ElementBias, + ElementCompute>; + typename EpilogueOp::Params epilogue{alpha, beta, gamma, scale}; + DISPATCH_KERNEL; + } + default: + megdnn_assert(false, + "unsupported nonlinear mode for conv bias operator"); + } +#undef DISPATCH_KERNEL_WITH_TILE_SHAPE +#undef DISPATCH_KERNEL +} +#endif + +#define INST(need_load_from_const_mem) \ + template void megdnn::cuda::cutlass_wrapper:: \ + do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4< \ + need_load_from_const_mem>( \ + const int8_t* d_src, const int8_t* d_filter, \ + const int32_t* d_bias, const int8_t* d_z, int8_t* d_dst, \ + int* workspace, const convolution::ConvParam& param, \ + uint32_t nonlinear_mode, float alpha, float beta, \ + float gamma, float scale, \ + const GemmCoord& threadblock_shape, \ + const GemmCoord& warp_shape, cudaStream_t stream); +INST(true); +INST(false); +#undef INST + +/* ================ cutlass kernel wrapper for nchw4 layout ================= */ #if MEGDNN_TEGRA_X1 template void megdnn::cuda::cutlass_wrapper:: @@ -275,6 +402,7 @@ INST(true); INST(false); #undef INST +/* ===== cutlass kernel wrapper for nchw4 layout and nchw output ===== */ #if MEGDNN_TEGRA_X1 template void megdnn::cuda::cutlass_wrapper:: @@ -401,4 +529,131 @@ void megdnn::cuda::cutlass_wrapper:: INST(true); INST(false); #undef INST + +/* ====== cutlass kernel wrapper for nchw4 layout and nchw32 output ====== */ +#if MEGDNN_TEGRA_X1 +template +void megdnn::cuda::cutlass_wrapper:: + do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32( + const int8_t* /* d_src */, const int8_t* /* d_filter */, + const int32_t* /* d_bias */, const int8_t* /* d_z */, + int8_t* /* d_dst */, int* /* workspace */, + const convolution::ConvParam& /* param */, + uint32_t /* nonlinear_mode */, float /* alpha */, + float /* beta */, float /* gamma */, float /* scale */, + const GemmCoord& /* threadblock_shape */, + const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {} +#else +template +void megdnn::cuda::cutlass_wrapper:: + do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32( + const int8_t* d_src, const int8_t* d_filter, + const int32_t* d_bias, const int8_t* d_z, int8_t* d_dst, + int* workspace, const convolution::ConvParam& param, + uint32_t nonlinear_mode, float alpha, float beta, float gamma, + float scale, const GemmCoord& threadblock_shape, + const GemmCoord& warp_shape, cudaStream_t stream) { +#define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \ + threadblock_k_, warp_m_, warp_n_, \ + warp_k_, aligned_) \ + if (threadblock_shape.m() == threadblock_m_ && \ + threadblock_shape.n() == threadblock_n_ && \ + threadblock_shape.k() == threadblock_k_ && \ + warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \ + warp_shape.k() == warp_k_) { \ + using ThreadBlockShape = \ + cutlass::gemm::GemmShape; \ + using WarpShape = cutlass::gemm::GemmShape; \ + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ + using Convolution = cutlass::convolution::device::Convolution< \ + int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ + cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ + cutlass::layout::TensorNCxHWx<32>, int32_t, \ + cutlass::layout::TensorNCxHWx<32>, int32_t, \ + cutlass::convolution::ConvType::kConvolution, \ + cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ + cutlass::convolution::threadblock:: \ + ConvolutionNCxHWxThreadblockSwizzle< \ + cutlass::convolution::ConvType::kConvolution>, \ + 2, 4, aligned_, NeedLoadFromConstMem>; \ + typename Convolution::ConvolutionParameter conv_param{ \ + param.n, param.ci, param.co, param.hi, param.wi, \ + param.fh, param.fw, param.ho, param.wo, param.sh, \ + param.sw, param.ph, param.pw, 1, 1}; \ + return cutlass_convolution_wrapper( \ + d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ + epilogue, stream); \ + } +#define DISPATCH_KERNEL \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 16); \ + megdnn_assert(false, \ + "unsupported threadblock shape (%dx%dx%d) and warp shape " \ + "(%dx%dx%d)", \ + threadblock_shape.m(), threadblock_shape.n(), \ + threadblock_shape.k(), warp_shape.m(), warp_shape.n(), \ + warp_shape.k()); + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementBias = int32_t; + using ElementCompute = float; + using NonlineMode = megdnn::param_enumv::ConvBias::NonlineMode; + switch (nonlinear_mode) { + case NonlineMode::IDENTITY: { + using EpilogueOp = + cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + ElementOutput, 4, ElementAccumulator, ElementBias, + ElementCompute>; + typename EpilogueOp::Params epilogue{alpha, beta, gamma}; + DISPATCH_KERNEL; + } + case NonlineMode::RELU: { + using EpilogueOp = cutlass::epilogue::thread:: + BiasAddLinearCombinationReluClamp< + ElementOutput, 4, ElementAccumulator, ElementBias, + ElementCompute>; + typename EpilogueOp::Params epilogue{alpha, beta, gamma, 0}; + DISPATCH_KERNEL; + } + case NonlineMode::H_SWISH: { + using EpilogueOp = cutlass::epilogue::thread:: + BiasAddLinearCombinationHSwishClamp< + ElementOutput, 4, ElementAccumulator, ElementBias, + ElementCompute>; + typename EpilogueOp::Params epilogue{alpha, beta, gamma, scale}; + DISPATCH_KERNEL; + } + default: + megdnn_assert(false, + "unsupported nonlinear mode for conv bias operator"); + } +#undef DISPATCH_KERNEL_WITH_TILE_SHAPE +#undef DISPATCH_KERNEL +} +#endif + +#define INST(need_load_from_const_mem) \ + template void megdnn::cuda::cutlass_wrapper:: \ + do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32< \ + need_load_from_const_mem>( \ + const int8_t* d_src, const int8_t* d_filter, \ + const int32_t* d_bias, const int8_t* d_z, int8_t* d_dst, \ + int* workspace, const convolution::ConvParam& param, \ + uint32_t nonlinear_mode, float alpha, float beta, \ + float gamma, float scale, \ + const GemmCoord& threadblock_shape, \ + const GemmCoord& warp_shape, cudaStream_t stream); +INST(true); +INST(false); +#undef INST + // vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh index 2d78e8c3..85fdd29e 100644 --- a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh +++ b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh @@ -41,6 +41,15 @@ void do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32( cudaStream_t stream); template +void do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4( + const int8_t* d_src, const int8_t* d_filter, const int32_t* d_bias, + const int8_t* d_z, int8_t* d_dst, int* workspace, + const convolution::ConvParam& param, uint32_t nonlinear_mode, + float alpha, float beta, float gamma, float scale, + const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, + cudaStream_t stream); + +template void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4( const int8_t* d_src, const int8_t* d_filter, const int32_t* d_bias, const int8_t* d_z, int8_t* d_dst, int* workspace, @@ -58,6 +67,15 @@ void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw( const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, cudaStream_t stream); +template +void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32( + const int8_t* d_src, const int8_t* d_filter, const int32_t* d_bias, + const int8_t* d_z, int8_t* d_dst, int* workspace, + const convolution::ConvParam& param, uint32_t nonlinear_mode, + float alpha, float beta, float gamma, float scale, + const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, + cudaStream_t stream); + } // namespace cutlass_wrapper } // namespace cuda } // namespace megdnn diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp index 80fd2d35..b02e9027 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw32_imma.cpp @@ -35,10 +35,23 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::is_available( if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout), param.format)) return false; - if (param.format != Format::NCHW32) + if (param.format != Format::NCHW32 && param.format != Format::NCHW32_NCHW4) return false; - UNPACK_CONV_BIAS_NCHW32_PARAM(*(args.src_layout), fm, *(args.dst_layout), - param); + size_t n = args.src_layout->operator[](0), + ci = args.src_layout->operator[](1) * 32, + hi = args.src_layout->operator[](2), + wi = args.src_layout->operator[](3); + size_t ho = args.dst_layout->operator[](2), + wo = args.dst_layout->operator[](3); + size_t co; + if (param.format == Format::NCHW32) { + co = args.dst_layout->operator[](1) * 32; + } else { + megdnn_assert(param.format == Format::NCHW32_NCHW4); + co = args.dst_layout->operator[](1) * 4; + } + UNPACK_CONV_PARAMETER(fm, param); + MARK_USED_VAR // TODO support group conv available &= param.sparse == Sparse::DENSE; // mode must be cross correlation @@ -84,8 +97,21 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec( using Format = Param::Format; auto&& param = args.opr->param(); auto&& fm = args.filter_meta; - UNPACK_CONV_BIAS_NCHW32_PARAM(*(args.src_layout), fm, *(args.dst_layout), - param); + size_t n = args.src_layout->operator[](0), + ci = args.src_layout->operator[](1) * 32, + hi = args.src_layout->operator[](2), + wi = args.src_layout->operator[](3); + size_t ho = args.dst_layout->operator[](2), + wo = args.dst_layout->operator[](3); + size_t co; + if (param.format == Format::NCHW32) { + co = args.dst_layout->operator[](1) * 32; + } else { + megdnn_assert(param.format == Format::NCHW32_NCHW4); + co = args.dst_layout->operator[](1) * 4; + } + UNPACK_CONV_PARAMETER(fm, param); + MARK_USED_VAR auto&& stream = cuda_stream(args.opr->handle()); int8_t* filter_ptr = nullptr; @@ -137,33 +163,79 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec( } uint32_t nonlinear_mode = static_cast(param.nonlineMode); if (fh == 1 && fw == 1) { - cutlass_wrapper::do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32< - false>(args.src_tensor->compatible_ptr(), filter_ptr, - args.bias_tensor->compatible_ptr(), z_dev_ptr, - args.dst_tensor->compatible_ptr(), nullptr, - kern_param, nonlinear_mode, alpha, beta, gamma, - dst_scale, - cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m, - m_algo_param.threadblock_n, - m_algo_param.threadblock_k}, - cutlass_wrapper::GemmCoord{m_algo_param.warp_m, - m_algo_param.warp_n, - m_algo_param.warp_k}, - stream); + if (param.format == Format::NCHW32) { + cutlass_wrapper::do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32< + false>( + args.src_tensor->compatible_ptr(), filter_ptr, + args.bias_tensor->compatible_ptr(), z_dev_ptr, + args.dst_tensor->compatible_ptr(), nullptr, + kern_param, nonlinear_mode, alpha, beta, gamma, dst_scale, + cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + cutlass_wrapper::GemmCoord{m_algo_param.warp_m, + m_algo_param.warp_n, + m_algo_param.warp_k}, + stream); + } else { + megdnn_assert(param.format == Format::NCHW32_NCHW4); + cutlass_wrapper:: + do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4< + false>( + args.src_tensor->compatible_ptr(), + filter_ptr, + args.bias_tensor->compatible_ptr(), + z_dev_ptr, + args.dst_tensor->compatible_ptr(), nullptr, + kern_param, nonlinear_mode, alpha, beta, gamma, + dst_scale, + cutlass_wrapper::GemmCoord{ + m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + cutlass_wrapper::GemmCoord{m_algo_param.warp_m, + m_algo_param.warp_n, + m_algo_param.warp_k}, + stream); + } } else { - cutlass_wrapper::do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32( - args.src_tensor->compatible_ptr(), filter_ptr, - args.bias_tensor->compatible_ptr(), z_dev_ptr, - args.dst_tensor->compatible_ptr(), nullptr, kern_param, - nonlinear_mode, alpha, beta, gamma, dst_scale, - cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m, - m_algo_param.threadblock_n, - m_algo_param.threadblock_k}, - cutlass_wrapper::GemmCoord{m_algo_param.warp_m, - m_algo_param.warp_n, - m_algo_param.warp_k}, - stream); + if (param.format == Format::NCHW32) { + cutlass_wrapper::do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32< + true>( + args.src_tensor->compatible_ptr(), filter_ptr, + args.bias_tensor->compatible_ptr(), z_dev_ptr, + args.dst_tensor->compatible_ptr(), nullptr, + kern_param, nonlinear_mode, alpha, beta, gamma, dst_scale, + cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + cutlass_wrapper::GemmCoord{m_algo_param.warp_m, + m_algo_param.warp_n, + m_algo_param.warp_k}, + stream); + } else { + megdnn_assert(param.format == Format::NCHW32_NCHW4); + cutlass_wrapper:: + do_conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4< + true>( + args.src_tensor->compatible_ptr(), + filter_ptr, + args.bias_tensor->compatible_ptr(), + z_dev_ptr, + args.dst_tensor->compatible_ptr(), nullptr, + kern_param, nonlinear_mode, alpha, beta, gamma, + dst_scale, + cutlass_wrapper::GemmCoord{ + m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + cutlass_wrapper::GemmCoord{m_algo_param.warp_m, + m_algo_param.warp_n, + m_algo_param.warp_k}, + stream); + } } + after_kernel_launch(); } std::string ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::to_string( @@ -189,8 +261,21 @@ void ConvBiasForwardImpl::AlgoInt8NCHW32IMMAImplicitGemm::exec_preprocess( using Format = Param::Format; auto&& param = args.opr->param(); auto&& fm = args.filter_meta; - UNPACK_CONV_BIAS_NCHW32_PARAM(*(args.src_layout), fm, *(args.dst_layout), - param); + size_t n = args.src_layout->operator[](0), + ci = args.src_layout->operator[](1) * 32, + hi = args.src_layout->operator[](2), + wi = args.src_layout->operator[](3); + size_t ho = args.dst_layout->operator[](2), + wo = args.dst_layout->operator[](3); + size_t co; + if (param.format == Format::NCHW32) { + co = args.dst_layout->operator[](1) * 32; + } else { + megdnn_assert(param.format == Format::NCHW32_NCHW4); + co = args.dst_layout->operator[](1) * 4; + } + UNPACK_CONV_PARAMETER(fm, param); + MARK_USED_VAR TensorLayout src{{co, ci / 32, fh, fw, 32}, dtype::Int8()}; src.init_contiguous_stride(); TensorLayout dst = src; diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp index 3eb7bb9f..58c63144 100644 --- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp +++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp @@ -208,6 +208,24 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( stream); } else { megdnn_assert(param.format == Format::NCHW4_NCHW32); + cutlass_wrapper:: + do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32< + false>( + args.src_tensor->compatible_ptr(), + filter_ptr, + args.bias_tensor->compatible_ptr(), + args.z_tensor->compatible_ptr(), + args.dst_tensor->compatible_ptr(), nullptr, + kern_param, nonlinear_mode, alpha, beta, gamma, + dst_scale, + cutlass_wrapper::GemmCoord{ + m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + cutlass_wrapper::GemmCoord{m_algo_param.warp_m, + m_algo_param.warp_n, + m_algo_param.warp_k}, + stream); } } else { if (param.format == Format::NCHW4) { @@ -246,6 +264,24 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( } else { megdnn_assert(param.format == Format::NCHW4_NCHW32); + cutlass_wrapper:: + do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32< + true>( + args.src_tensor->compatible_ptr(), + filter_ptr, + args.bias_tensor->compatible_ptr(), + args.z_tensor->compatible_ptr(), + args.dst_tensor->compatible_ptr(), nullptr, + kern_param, nonlinear_mode, alpha, beta, gamma, + dst_scale, + cutlass_wrapper::GemmCoord{ + m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + cutlass_wrapper::GemmCoord{m_algo_param.warp_m, + m_algo_param.warp_n, + m_algo_param.warp_k}, + stream); } } after_kernel_launch(); diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_hswish.cu new file mode 100644 index 00000000..b2c9c462 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_id.cu new file mode 100644 index 00000000..e758d0da --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_relu.cu new file mode 100644 index 00000000..f707002d --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_hswish.cu new file mode 100644 index 00000000..d1c44bec --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_id.cu new file mode 100644 index 00000000..5158b527 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_relu.cu new file mode 100644 index 00000000..21c01d36 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_hswish.cu new file mode 100644 index 00000000..b28f6824 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_id.cu new file mode 100644 index 00000000..f106af21 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_relu.cu new file mode 100644 index 00000000..7f45ef62 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_hswish.cu new file mode 100644 index 00000000..130b04b8 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_id.cu new file mode 100644 index 00000000..42802443 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_relu.cu new file mode 100644 index 00000000..0a7d3c99 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_hswish.cu new file mode 100644 index 00000000..716a540b --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_id.cu new file mode 100644 index 00000000..35b97bcd --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_relu.cu new file mode 100644 index 00000000..8ea93a1c --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_hswish.cu new file mode 100644 index 00000000..36819598 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_id.cu new file mode 100644 index 00000000..ebeb8291 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_relu.cu new file mode 100644 index 00000000..d9b73fa2 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_hswish.cu new file mode 100644 index 00000000..0cd9b194 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_id.cu new file mode 100644 index 00000000..4a89381a --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_relu.cu new file mode 100644 index 00000000..0011b40b --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_hswish.cu new file mode 100644 index 00000000..a60e3b18 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; +using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_id.cu new file mode 100644 index 00000000..4bee34e8 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; +using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_relu.cu new file mode 100644 index 00000000..69ab4e10 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; +using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_hswish.cu new file mode 100644 index 00000000..eb4d164e --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_id.cu new file mode 100644 index 00000000..576e2378 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_relu.cu new file mode 100644 index 00000000..32917d00 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_hswish.cu new file mode 100644 index 00000000..367f6012 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_id.cu new file mode 100644 index 00000000..e5ba977b --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_relu.cu new file mode 100644 index 00000000..c832049c --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_hswish.cu new file mode 100644 index 00000000..8a81f587 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_id.cu new file mode 100644 index 00000000..e391dd64 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_relu.cu new file mode 100644 index 00000000..27309b70 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_hswish.cu new file mode 100644 index 00000000..026080c8 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_id.cu new file mode 100644 index 00000000..4deac474 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_relu.cu new file mode 100644 index 00000000..b0a42c14 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_hswish.cu new file mode 100644 index 00000000..48036be9 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_id.cu new file mode 100644 index 00000000..6d1a69d2 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_relu.cu new file mode 100644 index 00000000..91ce847d --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_hswish.cu new file mode 100644 index 00000000..6d4a698f --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; +using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_id.cu new file mode 100644 index 00000000..aeee09ac --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; +using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_relu.cu new file mode 100644 index 00000000..d601e01d --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; +using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_hswish.cu new file mode 100644 index 00000000..fd235e0a --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_id.cu new file mode 100644 index 00000000..ac180bd4 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_relu.cu new file mode 100644 index 00000000..352535b7 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_hswish.cu new file mode 100644 index 00000000..ccaca6e8 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_id.cu new file mode 100644 index 00000000..35915b26 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_relu.cu new file mode 100644 index 00000000..1c4b308e --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_hswish.cu new file mode 100644 index 00000000..c97b5e91 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_id.cu new file mode 100644 index 00000000..8327475a --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_relu.cu new file mode 100644 index 00000000..1b0ba002 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_hswish.cu new file mode 100644 index 00000000..cabcf1ae --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_id.cu new file mode 100644 index 00000000..2f111fd9 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_relu.cu new file mode 100644 index 00000000..b49faba5 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_hswish.cu new file mode 100644 index 00000000..e11f92ff --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_id.cu new file mode 100644 index 00000000..bf638233 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_relu.cu new file mode 100644 index 00000000..5ef27dfd --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_hswish.cu new file mode 100644 index 00000000..f46311f2 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_id.cu new file mode 100644 index 00000000..f9b97cd4 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_relu.cu new file mode 100644 index 00000000..3ac0bd36 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_hswish.cu new file mode 100644 index 00000000..c426dddf --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_id.cu new file mode 100644 index 00000000..9fe447f3 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_relu.cu new file mode 100644 index 00000000..198293e2 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_hswish.cu new file mode 100644 index 00000000..1d4a974f --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_id.cu new file mode 100644 index 00000000..069e96db --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_relu.cu new file mode 100644 index 00000000..80a947c2 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_hswish.cu new file mode 100644 index 00000000..5fe0f19c --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_id.cu new file mode 100644 index 00000000..ca80dda2 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_relu.cu new file mode 100644 index 00000000..812cdc81 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_hswish.cu new file mode 100644 index 00000000..0b0da908 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_id.cu new file mode 100644 index 00000000..89608b5b --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_relu.cu new file mode 100644 index 00000000..80aac1fc --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_hswish.cu new file mode 100644 index 00000000..338226f8 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_id.cu new file mode 100644 index 00000000..bde7fed6 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_relu.cu new file mode 100644 index 00000000..5cc092b6 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_hswish.cu new file mode 100644 index 00000000..b49cdc6b --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; +using WarpShape = cutlass::gemm::GemmShape<16, 32, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_id.cu new file mode 100644 index 00000000..6957270f --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; +using WarpShape = cutlass::gemm::GemmShape<16, 32, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_relu.cu new file mode 100644 index 00000000..28960bc4 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; +using WarpShape = cutlass::gemm::GemmShape<16, 32, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_hswish.cu new file mode 100644 index 00000000..99ff27cd --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_id.cu new file mode 100644 index 00000000..c913c159 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_relu.cu new file mode 100644 index 00000000..5144b3bc --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_hswish.cu new file mode 100644 index 00000000..ef7e3b9d --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>; +using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_id.cu new file mode 100644 index 00000000..adb02359 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>; +using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_relu.cu new file mode 100644 index 00000000..b3df1baf --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>; +using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_hswish.cu new file mode 100644 index 00000000..a0a2a8be --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_id.cu new file mode 100644 index 00000000..89645723 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_relu.cu new file mode 100644 index 00000000..513ff269 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; +using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_hswish.cu new file mode 100644 index 00000000..c9e96d49 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; +using WarpShape = cutlass::gemm::GemmShape<16, 32, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_id.cu new file mode 100644 index 00000000..20b9cddc --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; +using WarpShape = cutlass::gemm::GemmShape<16, 32, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_relu.cu new file mode 100644 index 00000000..41cd9ae5 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; +using WarpShape = cutlass::gemm::GemmShape<16, 32, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_hswish.cu new file mode 100644 index 00000000..3151e3dc --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_id.cu new file mode 100644 index 00000000..c20fe3d0 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_relu.cu new file mode 100644 index 00000000..dddbed50 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_hswish.cu new file mode 100644 index 00000000..6028fba1 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>; +using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_id.cu new file mode 100644 index 00000000..0ea06e92 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>; +using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_relu.cu new file mode 100644 index 00000000..688064da --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>; +using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; +using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< + int8_t, 4, int32_t, int32_t, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/test/cuda/conv_bias_int8.cpp b/dnn/test/cuda/conv_bias_int8.cpp index 41592a08..01a3b8d3 100644 --- a/dnn/test/cuda/conv_bias_int8.cpp +++ b/dnn/test/cuda/conv_bias_int8.cpp @@ -1232,6 +1232,73 @@ TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW4_NCHW) { run({{16, 4, 46, 80, 4}, {4, 4, 3, 3, 4}, {1, 4, 1, 1}}); } +TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW4_NCHW32) { + require_compute_capability(6, 1); + using namespace conv_bias; + Checker checker(handle_cuda()); + UniformIntRNG int_rng{-3, 3}; + UniformIntRNG bias_rng{-50, 50}; + ConvBias::Param param; + param.format = ConvBias::Param::Format::NCHW4_NCHW32; + param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY; + checker.set_before_exec_callback( + conv_bias::ConvBiasAlgoChecker( + "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM")); + checker.set_dtype(0, dtype::QuantizedS8(1.9980618f)) + .set_dtype(1, dtype::QuantizedS8(1.9980927f)) + .set_dtype(2, dtype::QuantizedS32(1.9980618f * 1.9980927f)) + .set_dtype(3, dtype::QuantizedS8(1.9980618f)) + .set_dtype(4, dtype::QuantizedS8(1.9980618f)) + .set_rng(0, &int_rng) + .set_rng(1, &int_rng) + .set_rng(2, &bias_rng) + .set_rng(3, &int_rng) + .set_param(param); + auto run = [&](const TensorShapeArray& shapes) { + checker.execs({shapes[0], shapes[1], shapes[2], {}, {}}); + }; + + run({{16, 4, 23, 40, 4}, {32, 4, 3, 3, 4}, {1, 1, 1, 1, 32}}); + run({{16, 4, 92, 160, 4}, {32, 4, 3, 3, 4}, {1, 1, 1, 1, 32}}); + run({{16, 4, 46, 80, 4}, {32, 4, 3, 3, 4}, {1, 1, 1, 1, 32}}); +} + +#if CUDA_VERSION >= 10020 +TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_NCHW4) { + require_compute_capability(7, 5); + using namespace conv_bias; + Checker checker(handle_cuda()); + UniformIntRNG int_rng{-3, 3}; + UniformIntRNG bias_rng{-50, 50}; + ConvBias::Param param; + param.format = ConvBias::Param::Format::NCHW32_NCHW4; + param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY; + checker.set_before_exec_callback(conv_bias::ConvBiasAlgoChecker< + ConvBiasForward>( + ConvBias::algo_name( + "INT8_NCHW32_IMMA_IMPLICIT_GEMM_256X128X64_64X64X64", + ConvBias::DirectParam{}) + .c_str())); + checker.set_dtype(0, dtype::QuantizedS8(1.9980618f)) + .set_dtype(1, dtype::QuantizedS8(1.9980927f)) + .set_dtype(2, dtype::QuantizedS32(1.9980618f * 1.9980927f)) + .set_dtype(3, dtype::QuantizedS8(1.9980618f)) + .set_dtype(4, dtype::QuantizedS8(1.9980618f)) + .set_rng(0, &int_rng) + .set_rng(1, &int_rng) + .set_rng(2, &bias_rng) + .set_rng(3, &int_rng) + .set_param(param); + auto run = [&](const TensorShapeArray& shapes) { + checker.execs({shapes[0], shapes[1], shapes[2], {}, {}}); + }; + + run({{16, 2, 23, 40, 32}, {20, 2, 3, 3, 32}, {1, 5, 1, 1, 4}}); + run({{16, 1, 92, 160, 32}, {24, 1, 3, 3, 32}, {1, 6, 1, 1, 4}}); + run({{16, 2, 46, 80, 32}, {4, 2, 3, 3, 32}, {1, 1, 1, 1, 4}}); +} +#endif + #if MEGDNN_WITH_BENCHMARK TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4) { require_compute_capability(6, 1);