diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu
index fd840927..f9dd4c45 100644
--- a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu
+++ b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu
@@ -275,4 +275,130 @@ INST(true);
 INST(false);
 #undef INST
 
+#if MEGDNN_TEGRA_X1
+template <bool NeedLoadFromConstMem>
+void megdnn::cuda::cutlass_wrapper::
+        do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw(
+                const int8_t* /* d_src */, const int8_t* /* d_filter */,
+                const float* /* d_bias */, const float* /* d_z */,
+                float* /* d_dst */, int* /* workspace */,
+                const convolution::ConvParam& /* param */,
+                uint32_t /* nonlinear_mode */, float /* alpha */,
+                float /* beta */, float /* gamma */, float /* scale */,
+                const GemmCoord& /* threadblock_shape */,
+                const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {}
+#else
+template <bool NeedLoadFromConstMem>
+void megdnn::cuda::cutlass_wrapper::
+        do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw(
+                const int8_t* d_src, const int8_t* d_filter,
+                const float* d_bias, const float* d_z, float* d_dst,
+                int* workspace, const convolution::ConvParam& param,
+                uint32_t nonlinear_mode, float alpha, float beta, float gamma,
+                float scale, const GemmCoord& threadblock_shape,
+                const GemmCoord& warp_shape, cudaStream_t stream) {
+#define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_,       \
+                                        threadblock_k_, warp_m_, warp_n_,     \
+                                        warp_k_, aligned_)                    \
+    if (threadblock_shape.m() == threadblock_m_ &&                            \
+        threadblock_shape.n() == threadblock_n_ &&                            \
+        threadblock_shape.k() == threadblock_k_ &&                            \
+        warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ &&             \
+        warp_shape.k() == warp_k_) {                                          \
+        using ThreadBlockShape =                                              \
+                cutlass::gemm::GemmShape<threadblock_m_, threadblock_n_,      \
+                                         threadblock_k_>;                     \
+        using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \
+        using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;           \
+        using Convolution = cutlass::convolution::device::Convolution<        \
+                int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t,             \
+                cutlass::layout::TensorCxRSKx<4>, ElementOutput,              \
+                cutlass::layout::TensorNCHW, float,                           \
+                cutlass::layout::TensorNCHW, int32_t,                         \
+                cutlass::convolution::ConvType::kConvolution,                 \
+                cutlass::arch::OpClassSimt, cutlass::arch::Sm61,              \
+                ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,    \
+                cutlass::convolution::threadblock::                           \
+                        ConvolutionNCxHWxThreadblockSwizzle<                  \
+                                cutlass::convolution::ConvType::kConvolution>, \
+                2, 4, aligned_, NeedLoadFromConstMem,                         \
+                cutlass::arch::OpMultiplyAdd>;                                \
+        typename Convolution::ConvolutionParameter conv_param{                \
+                param.n,  param.ci, param.co, param.hi, param.wi,             \
+                param.fh, param.fw, param.ho, param.wo, param.sh,             \
+                param.sw, param.ph, param.pw, 1,        1};                   \
+        return cutlass_convolution_wrapper<Convolution>(                      \
+                d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param,   \
+                epilogue, stream);                                            \
+    }
+#define DISPATCH_KERNEL                                                       \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 16);            \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 16);             \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 16);             \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 16);             \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 16);             \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 16);              \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 16);              \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 16);              \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 16);              \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 64, 8, 16, 64, 8, 4);                 \
+    megdnn_assert(false,                                                      \
+                  "unsupported threadblock shape (%dx%dx%d) and warp shape "  \
+                  "(%dx%dx%d)",                                               \
+                  threadblock_shape.m(), threadblock_shape.n(),               \
+                  threadblock_shape.k(), warp_shape.m(), warp_shape.n(),      \
+                  warp_shape.k());
+    using ElementOutput = float;
+    using ElementAccumulator = int32_t;
+    using ElementBias = float;
+    using ElementCompute = float;
+    using NonlineMode = megdnn::param_enumv::ConvBias::NonlineMode;
+    switch (nonlinear_mode) {
+        case NonlineMode::IDENTITY: {
+            using EpilogueOp =
+                    cutlass::epilogue::thread::BiasAddLinearCombination<
+                            ElementOutput, 1, ElementAccumulator, ElementBias,
+                            ElementCompute>;
+            typename EpilogueOp::Params epilogue{alpha, beta, gamma};
+            DISPATCH_KERNEL;
+        }
+        case NonlineMode::RELU: {
+            using EpilogueOp =
+                    cutlass::epilogue::thread::BiasAddLinearCombinationRelu<
+                            ElementOutput, 1, ElementAccumulator, ElementBias,
+                            ElementCompute>;
+            typename EpilogueOp::Params epilogue{alpha, beta, gamma, 0};
+            DISPATCH_KERNEL;
+        }
+        case NonlineMode::H_SWISH: {
+            using EpilogueOp =
+                    cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
+                            ElementOutput, 1, ElementAccumulator, ElementBias,
+                            ElementCompute>;
+            typename EpilogueOp::Params epilogue{alpha, beta, gamma, scale};
+            DISPATCH_KERNEL;
+        }
+        default:
+            megdnn_assert(false,
+                          "unsupported nonlinear mode for conv bias operator");
+    }
+#undef DISPATCH_KERNEL_WITH_TILE_SHAPE
+#undef DISPATCH_KERNEL
+}
+#endif
+
+#define INST(need_load_from_const_mem)                                        \
+    template void megdnn::cuda::cutlass_wrapper::                             \
+            do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw<              \
+                    need_load_from_const_mem>(                                \
+                    const int8_t* d_src, const int8_t* d_filter,              \
+                    const float* d_bias, const float* d_z, float* d_dst,      \
+                    int* workspace, const convolution::ConvParam& param,      \
+                    uint32_t nonlinear_mode, float alpha, float beta,         \
+                    float gamma, float scale,                                 \
+                    const GemmCoord& threadblock_shape,                       \
+                    const GemmCoord& warp_shape, cudaStream_t stream);
+INST(true);
+INST(false);
+#undef INST
 
 // vim: syntax=cuda.doxygen
diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh
index 172ed5d7..2d78e8c3 100644
--- a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh
+++ b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh
@@ -22,8 +22,11 @@ using GemmCoord = cutlass::gemm::GemmCoord;
 template <typename Convolution>
 void cutlass_convolution_wrapper(
-        const int8_t* d_src, const int8_t* d_filter, const int32_t* d_bias,
-        const int8_t* d_z, int8_t* d_dst, int* workspace,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst, int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
         cudaStream_t stream);
@@ -46,6 +49,15 @@ void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4(
         const GemmCoord& threadblock_shape, const GemmCoord& warp_shape,
         cudaStream_t stream);
 
+template <bool NeedLoadFromConstMem>
+void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw(
+        const int8_t* d_src, const int8_t* d_filter, const float* d_bias,
+        const float* d_z, float* d_dst, int* workspace,
+        const convolution::ConvParam& param, uint32_t nonlinear_mode,
+        float alpha, float beta, float gamma, float scale,
+        const GemmCoord& threadblock_shape, const GemmCoord& warp_shape,
+        cudaStream_t stream);
+
 }  // namespace cutlass_wrapper
 }  // namespace cuda
 }  // namespace megdnn
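For orientation, here is a minimal caller sketch of the new NCHW4→NCHW entry point. Everything in it is hypothetical: the device buffers, the `ConvParam` field names, and the assumption that `nonlinear_mode` 0 maps to IDENTITY; tile shapes must match one of the `DISPATCH_KERNEL_WITH_TILE_SHAPE` entries above or the dispatch falls through to `megdnn_assert`.

```cpp
// Hypothetical caller sketch (not part of the patch). d_src/d_filter/d_bias/
// d_dst are assumed to be valid device allocations sized for `p`.
using namespace megdnn::cuda::cutlass_wrapper;

megdnn::cuda::convolution::ConvParam p;
p.n = 1; p.ci = 16; p.co = 16;        // input channels: multiples of 4
p.hi = p.wi = 56; p.ho = p.wo = 56;   // spatial sizes consistent with padding
p.fh = p.fw = 3; p.sh = p.sw = 1; p.ph = p.pw = 1;

// Must be one of the shapes enumerated in DISPATCH_KERNEL above.
GemmCoord threadblock{64, 128, 32}, warp{64, 32, 32};

// 3x3 filter -> NeedLoadFromConstMem = true; assumes 0 == IDENTITY.
do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw<true>(
        d_src, d_filter, d_bias, /*d_z=*/nullptr, d_dst,
        /*workspace=*/nullptr, p, /*nonlinear_mode=*/0,
        /*alpha=*/0.1f, /*beta=*/1.f, /*gamma=*/0.f, /*scale=*/1.f,
        threadblock, warp, stream);
```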
diff --git a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp
index 22451bf3..3eb7bb9f 100644
--- a/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp
+++ b/dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp
@@ -32,10 +32,26 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available(
     if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout),
                                                 param.format))
         return false;
-    if (param.format != Format::NCHW4)
+    if (param.format != Format::NCHW4 && param.format != Format::NCHW4_NCHW &&
+        param.format != Format::NCHW4_NCHW32)
         return false;
-    UNPACK_CONV_BIAS_NCHW4_PARAM(*(args.src_layout), fm, *(args.dst_layout),
-                                 param);
+    size_t n = args.src_layout->operator[](0),
+           ci = args.src_layout->operator[](1) * 4,
+           hi = args.src_layout->operator[](2),
+           wi = args.src_layout->operator[](3);
+    size_t ho = args.dst_layout->operator[](2),
+           wo = args.dst_layout->operator[](3);
+    size_t co;
+    if (param.format == Format::NCHW4) {
+        co = args.dst_layout->operator[](1) * 4;
+    } else if (param.format == Format::NCHW4_NCHW) {
+        co = args.dst_layout->operator[](1);
+    } else {
+        megdnn_assert(param.format == Format::NCHW4_NCHW32);
+        co = args.dst_layout->operator[](1) * 32;
+    }
+    UNPACK_CONV_PARAMETER(fm, param);
+    MARK_USED_VAR
     // TODO support group conv
     available &= param.sparse == Sparse::DENSE;
     // mode must be cross correlation
@@ -46,9 +62,11 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available(
          bias_dtype = args.bias_layout->dtype,
          dst_dtype = args.dst_layout->dtype;
     available &= (src_dtype.enumv() == DTypeEnum::QuantizedS8 &&
-                  filter_dtype.enumv() == DTypeEnum::QuantizedS8 &&
-                  bias_dtype.enumv() == DTypeEnum::QuantizedS32 &&
-                  dst_dtype.enumv() == DTypeEnum::QuantizedS8);
+                  filter_dtype.enumv() == DTypeEnum::QuantizedS8);
+    available &= (bias_dtype.enumv() == DTypeEnum::QuantizedS32 &&
+                  dst_dtype.enumv() == DTypeEnum::QuantizedS8) ||
+                 (bias_dtype.enumv() == DTypeEnum::Float32 &&
+                  dst_dtype.enumv() == DTypeEnum::Float32);
     // TODO: support dilation
     available &= dh == 1 && dw == 1;
     // only support sm_61 or later, platform should have fast native int8
@@ -81,8 +99,23 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
     using Format = Param::Format;
     auto&& param = args.opr->param();
     auto&& fm = args.filter_meta;
-    UNPACK_CONV_BIAS_NCHW4_PARAM(*(args.src_layout), fm, *(args.dst_layout),
-                                 param);
+    size_t n = args.src_layout->operator[](0),
+           ci = args.src_layout->operator[](1) * 4,
+           hi = args.src_layout->operator[](2),
+           wi = args.src_layout->operator[](3);
+    size_t ho = args.dst_layout->operator[](2),
+           wo = args.dst_layout->operator[](3);
+    size_t co;
+    if (param.format == Format::NCHW4) {
+        co = args.dst_layout->operator[](1) * 4;
+    } else if (param.format == Format::NCHW4_NCHW) {
+        co = args.dst_layout->operator[](1);
+    } else {
+        megdnn_assert(param.format == Format::NCHW4_NCHW32);
+        co = args.dst_layout->operator[](1) * 32;
+    }
+    UNPACK_CONV_PARAMETER(fm, param);
+    MARK_USED_VAR
     auto&& stream = cuda_stream(args.opr->handle());
 
     int8_t* filter_ptr = nullptr;
@@ -115,47 +148,107 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
     float src_scale = args.src_layout->dtype.param<dtype::QuantizedS8>().scale,
           filter_scale =
-                  args.filter_layout->dtype.param<dtype::QuantizedS8>().scale,
-          bias_scale =
-                  args.bias_layout->dtype.param<dtype::QuantizedS32>().scale,
-          dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale;
-    float alpha = src_scale * filter_scale / dst_scale,
-          beta = bias_scale / dst_scale;
-    int8_t* z_dev_ptr = nullptr;
-    float gamma = 0.0;
+                  args.filter_layout->dtype.param<dtype::QuantizedS8>().scale;
+    float alpha = src_scale * filter_scale;
+    float beta = 1.f;
+    float dst_scale = 1.f;
+    if (args.bias_layout->dtype.enumv() == DTypeEnum::QuantizedS32) {
+        megdnn_assert(args.dst_layout->dtype.enumv() == DTypeEnum::QuantizedS8);
+        float bias_scale =
+                args.bias_layout->dtype.param<dtype::QuantizedS32>().scale;
+        dst_scale = args.dst_layout->dtype.param<dtype::QuantizedS8>().scale;
+        alpha /= dst_scale, beta = bias_scale / dst_scale;
+    }
+    float gamma = 0.f;
     if (args.z_layout->ndim > 0) {
-        z_dev_ptr = args.z_tensor->compatible_ptr<int8_t>();
-        float z_scale = args.z_layout->dtype.param<dtype::QuantizedS8>().scale;
-        gamma = z_scale / dst_scale;
+        gamma = 1.f;
+        if (args.z_layout->dtype.enumv() == DTypeEnum::QuantizedS8) {
+            megdnn_assert(args.dst_layout->dtype.enumv() ==
+                          DTypeEnum::QuantizedS8);
+            float z_scale =
+                    args.z_layout->dtype.param<dtype::QuantizedS8>().scale;
+            gamma = z_scale / dst_scale;
+        }
     }
     uint32_t nonlinear_mode = static_cast<uint32_t>(param.nonlineMode);
     if (fh == 1 && fw == 1) {
-        cutlass_wrapper::do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4<false>(
-                args.src_tensor->compatible_ptr<int8_t>(), filter_ptr,
-                args.bias_tensor->compatible_ptr<int32_t>(), z_dev_ptr,
-                args.dst_tensor->compatible_ptr<int8_t>(), nullptr, kern_param,
-                nonlinear_mode, alpha, beta, gamma, dst_scale,
-                cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m,
-                                           m_algo_param.threadblock_n,
-                                           m_algo_param.threadblock_k},
-                cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
-                                           m_algo_param.warp_n,
-                                           m_algo_param.warp_k},
-                stream);
+        if (param.format == Format::NCHW4) {
+            cutlass_wrapper::do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4<
+                    false>(
+                    args.src_tensor->compatible_ptr<int8_t>(), filter_ptr,
+                    args.bias_tensor->compatible_ptr<int32_t>(),
+                    args.z_tensor->compatible_ptr<int8_t>(),
+                    args.dst_tensor->compatible_ptr<int8_t>(), nullptr,
+                    kern_param, nonlinear_mode, alpha, beta, gamma, dst_scale,
+                    cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m,
+                                               m_algo_param.threadblock_n,
+                                               m_algo_param.threadblock_k},
+                    cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
+                                               m_algo_param.warp_n,
+                                               m_algo_param.warp_k},
+                    stream);
+        } else if (param.format == Format::NCHW4_NCHW) {
+            cutlass_wrapper::
+                    do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw<false>(
+                            args.src_tensor->compatible_ptr<int8_t>(),
+                            filter_ptr,
+                            args.bias_tensor->compatible_ptr<float>(),
+                            args.z_tensor->compatible_ptr<float>(),
+                            args.dst_tensor->compatible_ptr<float>(), nullptr,
+                            kern_param, nonlinear_mode, alpha, beta, gamma,
+                            dst_scale,
+                            cutlass_wrapper::GemmCoord{
+                                    m_algo_param.threadblock_m,
+                                    m_algo_param.threadblock_n,
+                                    m_algo_param.threadblock_k},
+                            cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
+                                                       m_algo_param.warp_n,
+                                                       m_algo_param.warp_k},
+                            stream);
+        } else {
+            megdnn_assert(param.format == Format::NCHW4_NCHW32);
+        }
     } else {
-        cutlass_wrapper::do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4<true>(
-                args.src_tensor->compatible_ptr<int8_t>(), filter_ptr,
-                args.bias_tensor->compatible_ptr<int32_t>(), z_dev_ptr,
-                args.dst_tensor->compatible_ptr<int8_t>(), nullptr, kern_param,
-                nonlinear_mode, alpha, beta, gamma, dst_scale,
-                cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m,
-                                           m_algo_param.threadblock_n,
-                                           m_algo_param.threadblock_k},
-                cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
-                                           m_algo_param.warp_n,
-                                           m_algo_param.warp_k},
-                stream);
+        if (param.format == Format::NCHW4) {
+            cutlass_wrapper::do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4<
+                    true>(
+                    args.src_tensor->compatible_ptr<int8_t>(), filter_ptr,
+                    args.bias_tensor->compatible_ptr<int32_t>(),
+                    args.z_tensor->compatible_ptr<int8_t>(),
+                    args.dst_tensor->compatible_ptr<int8_t>(), nullptr,
+                    kern_param, nonlinear_mode, alpha, beta, gamma, dst_scale,
+                    cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m,
+                                               m_algo_param.threadblock_n,
+                                               m_algo_param.threadblock_k},
+                    cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
+                                               m_algo_param.warp_n,
+                                               m_algo_param.warp_k},
+                    stream);
+        } else if (param.format == Format::NCHW4_NCHW) {
+            cutlass_wrapper::
+                    do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw<true>(
+                            args.src_tensor->compatible_ptr<int8_t>(),
+                            filter_ptr,
+                            args.bias_tensor->compatible_ptr<float>(),
+                            args.z_tensor->compatible_ptr<float>(),
+                            args.dst_tensor->compatible_ptr<float>(), nullptr,
+                            kern_param, nonlinear_mode, alpha, beta, gamma,
+                            dst_scale,
+                            cutlass_wrapper::GemmCoord{
+                                    m_algo_param.threadblock_m,
+                                    m_algo_param.threadblock_n,
+                                    m_algo_param.threadblock_k},
+                            cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
+                                                       m_algo_param.warp_n,
+                                                       m_algo_param.warp_k},
+                            stream);
+
+        } else {
+            megdnn_assert(param.format == Format::NCHW4_NCHW32);
+        }
     }
+    after_kernel_launch();
 }
 
 size_t ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::
@@ -174,8 +267,23 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec_preprocess(
     using Format = Param::Format;
     auto&& param = args.opr->param();
     auto&& fm = args.filter_meta;
-    UNPACK_CONV_BIAS_NCHW4_PARAM(*(args.src_layout), fm, *(args.dst_layout),
-                                 param);
+    size_t n = args.src_layout->operator[](0),
+           ci = args.src_layout->operator[](1) * 4,
+           hi = args.src_layout->operator[](2),
+           wi = args.src_layout->operator[](3);
+    size_t ho = args.dst_layout->operator[](2),
+           wo = args.dst_layout->operator[](3);
+    size_t co;
+    if (param.format == Format::NCHW4) {
+        co = args.dst_layout->operator[](1) * 4;
+    } else if (param.format == Format::NCHW4_NCHW) {
+        co = args.dst_layout->operator[](1);
+    } else {
+        megdnn_assert(param.format == Format::NCHW4_NCHW32);
+        co = args.dst_layout->operator[](1) * 32;
+    }
+    UNPACK_CONV_PARAMETER(fm, param);
+    MARK_USED_VAR
     TensorLayout src{{co, ci / 4 * fh * fw}, dtype::Int32()};
     src.init_contiguous_stride();
     TensorLayout dst = src;
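The scale plumbing in `exec()` is the subtle part of this change, so here is a standalone restatement of the intended derivation. The predicate and scale names are hypothetical stand-ins for the dtype queries above, and the reading that the epilogue computes `act(alpha * acc + beta * bias + gamma * z)` is ours:

```cpp
// Restatement of the coefficient derivation in exec() above.
struct Scales { float alpha, beta, gamma, dst_scale; };

Scales make_scales(float src_scale, float filter_scale, bool bias_is_qs32,
                   float bias_scale, float dst_qs8_scale, bool has_z,
                   bool z_is_qs8, float z_scale) {
    Scales s{src_scale * filter_scale, 1.f, 0.f, 1.f};
    if (bias_is_qs32) {             // quantized path: dst must be QuantizedS8
        s.dst_scale = dst_qs8_scale;  // assign the outer dst_scale; a fresh
                                      // declaration here would leave gamma
                                      // computed against 1.f below
        s.alpha /= s.dst_scale;     // fold dst scale into the accumulator term
        s.beta = bias_scale / s.dst_scale;
    }                               // float path: coefficients stay in real units
    if (has_z) {
        s.gamma = 1.f;              // float residual z is added as-is
        if (z_is_qs8)               // quantized z is rebased onto dst units
            s.gamma = z_scale / s.dst_scale;
    }
    return s;
}
```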
diff --git a/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl b/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl
index 785b7978..256742a1 100644
--- a/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl
+++ b/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl
@@ -19,25 +19,28 @@ using namespace cutlass_wrapper;
 
 template <typename Convolution>
 void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper(
-        const int8_t* d_src, const int8_t* d_filter, const int32_t* d_bias,
-        const int8_t* d_z, int8_t* d_dst, int* workspace,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst, int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
         cudaStream_t stream) {
     typename Convolution::TensorRefSrc tensor_src{
-            const_cast<int8_t*>(d_src),
+            const_cast<typename Convolution::ElementSrc*>(d_src),
             Convolution::LayoutSrc::packed({conv_param.n(), conv_param.hi(),
                                             conv_param.wi(), conv_param.ci()})};
     typename Convolution::TensorRefFilter tensor_filter{
-            const_cast<int8_t*>(d_filter),
+            const_cast<typename Convolution::ElementFilter*>(d_filter),
             Convolution::LayoutFilter::packed({conv_param.co(), conv_param.fh(),
                                                conv_param.fw(), conv_param.ci()})};
     typename Convolution::TensorRefBias tensor_bias{
-            const_cast<int32_t*>(d_bias),
+            const_cast<typename Convolution::ElementBias*>(d_bias),
             Convolution::LayoutBias::packed({1, 1, 1, conv_param.co()})};
     typename Convolution::TensorRefDst tensor_z{
-            const_cast<int8_t*>(d_z),
+            const_cast<typename Convolution::ElementDst*>(d_z),
             Convolution::LayoutDst::packed({conv_param.n(), conv_param.ho(),
                                             conv_param.wo(), conv_param.co()})};
     typename Convolution::TensorRefDst tensor_dst{
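The three `BiasAddLinearCombination*` epilogue families referenced throughout this patch differ only in the activation applied after the linear combination. A scalar model of the arithmetic follows; this is our reading of the parameter roles (the vectorized CUTLASS code is the source of truth, and the exact point where `scale` is applied in the H-Swish variant is an assumption):

```cpp
#include <algorithm>
#include <cstdint>

// Scalar model of BiasAddLinearCombination{,Relu,HSwish}.
// acc: int32 convolution accumulator; bias: per-channel bias; z: residual.
float bias_add_linear_combination(float alpha, int32_t acc, float beta,
                                  float bias, float gamma, float z) {
    return alpha * static_cast<float>(acc) + beta * bias + gamma * z;
}
float relu(float x) { return std::max(x, 0.f); }
// H-Swish is the one activation that consumes the extra `scale` parameter,
// which we assume rescales the activated value into the output domain.
float hswish(float x, float scale) {
    return x * std::min(std::max(x + 3.f, 0.f), 6.f) * (1.f / 6.f) * scale;
}
```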
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_hswish.cu
index 61802649..4f0eebb7 100644
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_hswish.cu
@@ -8,6 +8,7 @@
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>;
 using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClam
         int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution,
         cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 4, 16, true>;
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_id.cu
index b32dc890..b92f7681 100644
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_id.cu
@@ -8,6 +8,7 @@
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>;
 using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
         int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution,
         cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 4, 16, true>;
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_relu.cu
index 9f0eb40e..e47ab052 100644
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_relu.cu
@@ -8,6 +8,7 @@
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>;
 using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
         int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution,
         cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 4, 16, true>;
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_hswish.cu
index a3a08d3c..30eec136 100644
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_hswish.cu
@@ -8,6 +8,7 @@
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>;
 using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClam
         int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution,
         cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 4, 16, true>;
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_id.cu
index 636ec223..1307f937 100644
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_id.cu
@@ -8,6 +8,7 @@
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>;
 using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
         int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution,
         cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 4, 16, true>;
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_relu.cu
index f16fe480..1e095d92 100644
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_relu.cu
@@ -8,6 +8,7 @@
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>;
 using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
         int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution,
         cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 4, 16, true>;
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_hswish.cu
index cb4167c5..dbf1e900 100644
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_hswish.cu
@@ -8,6 +8,7 @@
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>;
 using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClam
         int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution,
         cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 4, 16, true>;
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_id.cu
index 9795f15d..1ef223f9 100644
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_id.cu
@@ -8,6 +8,7 @@
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>;
 using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
         int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution,
         cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 4, 16, true>;
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_relu.cu
index 06e8522d..65a5fcc7 100644
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_relu.cu
@@ -8,6 +8,7 @@
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>;
 using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
         int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution,
         cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 4, 16, true>;
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_hswish.cu
index 863cffca..931f3716 100644
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_hswish.cu
@@ -8,6 +8,7 @@
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>;
 using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClam
         int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution,
         cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 4, 4, true>;
+        2, 4, 4, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu
index 205314be..70949c65 100644
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu
@@ -8,6 +8,7 @@
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>;
 using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
         int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution,
         cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 4, 4, true>;
+        2, 4, 4, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_relu.cu
index e0021192..0aaa3f7a 100644
--- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_relu.cu
@@ -8,6 +8,7 @@
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>;
 using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
         int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution,
         cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 4, 4, true>;
+        2, 4, 4, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
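The `kimpl/*.cu` files above and the `1x1_*` variants below all follow one generation pattern: the shared wrapper template lives in the `.cuinl`, and each file pins exactly one (tile shape, epilogue, `NeedLoadFromConstMem`) combination via explicit instantiation, so every variant compiles in its own translation unit. A toy reduction of the pattern (names are illustrative, not from the patch):

```cpp
// Generic template -- in real code this is the .cuinl included by every file.
template <typename Config>
void run_conv(const typename Config::Element* src,
              typename Config::Element* dst) {
    // ... launch a kernel specialized on Config::kTileM x Config::kTileN ...
}

// One concrete configuration per .cu file; the explicit instantiation below
// forces this specialization to be compiled in this translation unit only,
// keeping per-file compile times small and build parallelism high.
struct Config128x128 {
    using Element = int8_t;
    static constexpr int kTileM = 128, kTileN = 128;
};
template void run_conv<Config128x128>(const int8_t*, int8_t*);
```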
cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_hswish.cu index 34a0ceb0..dc198d34 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClam int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_id.cu index 3a6dd8e6..e20f26dc 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = 
cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_relu.cu index c3fb2fac..f9e8b712 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_hswish.cu 
b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_hswish.cu index 08312f67..ade39f7a 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClam int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_id.cu index 62b43ebf..e07c6dcf 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void 
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_relu.cu index 0fb9e330..27aa11b5 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_hswish.cu index cf2a74ae..3e38529b 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using 
EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_id.cu index 7b7f9aa8..1b3145c4 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_relu.cu
b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_relu.cu index e48c4f01..962dd2ff 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_hswish.cu index ee784998..4d3feb12 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 4, false>; + 2, 4, 4, false, + cutlass::arch::OpMultiplyAddSaturate>; template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_id.cu index 878d9843..1dbfbf60 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 4, false>; + 2, 4, 4, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_relu.cu index 851b5bfd..d4fcfb06 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = 
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 4, false>; + 2, 4, 4, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_hswish.cu index ecbb616e..ab979349 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_id.cu
b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_id.cu index fa14228a..4d9abe29 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_relu.cu index e07d406c..b1efef36 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void 
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_hswish.cu index 4daf64bf..fb6b71da 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_id.cu index 58ae9dbd..c38538c3 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp =
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_relu.cu index d36b4d31..b91efded 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_hswish.cu 
b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_hswish.cu index c122b165..c877492e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_id.cu index aee4d687..40e6cb5d 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_relu.cu index 65c59600..19f337a1 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_hswish.cu index f83c30bf..27f7d5b8 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using 
EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_id.cu index 300018c5..1a8e17fd 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_relu.cu
b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_relu.cu index ff10d17e..1a4e679f 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_hswish.cu index 298325d4..252a8f77 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_id.cu index 7c1548d5..f5d01d75 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_relu.cu index bff2c6a5..a29be07e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = 
cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_hswish.cu index 5bd5e833..07333503 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_id.cu
b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_id.cu index 9d4d2759..41f6db05 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_relu.cu index c2ec29a2..34896baf 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, false>; + 2, 4, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void 
megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_hswish.cu index e4530b41..5f247f0c 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, true>; + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu index aa3da4ee..81cdd18c 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp =
cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, true>; + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_relu.cu index edc33c23..0bce8bf5 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, true>; + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_hswish.cu index 
d480d5c4..1ffdb3c4 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, true>; + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_id.cu index db532d61..bf0fdcff 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, true>; + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, +
const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_relu.cu index f75b28d8..bb3a17ba 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, true>; + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_hswish.cu index 18e8bf12..9df0b4f4 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter,
int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, true>; + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_id.cu index 7a554763..c0da1d27 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, true>; + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_relu.cu index d0ed15c7..309aa239 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_relu.cu +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, true>; + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_hswish.cu index a97080ac..210c52e0 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClam int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, true>; + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename 
Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu index 62fb28f9..4f4f35f8 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, true>; + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_relu.cu index 608d5af4..57e4e210 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, 
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, true>; + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_hswish.cu index bf0add0c..224e8a63 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClam int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, true>; + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_id.cu index cfe256ca..55761010 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; 
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, true>; + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_relu.cu index e893e101..170f1971 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, true>; + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& 
conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_hswish.cu index fa81c504..03cbfbc5 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClam int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, true>; + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_id.cu index 71b62fb2..4698d99f 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< 
cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, true>; + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_relu.cu index 2d661c10..38358f7e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 4, 16, true>; + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_hswish.cu new file mode 100644 index 00000000..35736675 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using 
LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCHW; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, 1, int32_t, float, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, float, + LayoutDst, float, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAdd>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_id.cu new file mode 100644 index 00000000..c447d319 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCHW; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< + float, 1, int32_t, float, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, float, + LayoutDst, float, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAdd>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + 
typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_relu.cu new file mode 100644 index 00000000..898466e1 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCHW; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, 1, int32_t, float, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, float, + LayoutDst, float, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAdd>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_hswish.cu new file mode 100644 index 00000000..83dd4204 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCHW; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, 1, int32_t, float, 
float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, float, + LayoutDst, float, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAdd>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_id.cu new file mode 100644 index 00000000..5dc680a7 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCHW; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< + float, 1, int32_t, float, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, float, + LayoutDst, float, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAdd>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_relu.cu new file mode 100644 index 00000000..bbf92f63 --- /dev/null +++ 
b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCHW; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, 1, int32_t, float, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, float, + LayoutDst, float, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAdd>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_hswish.cu new file mode 100644 index 00000000..faa8ea18 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCHW; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, 1, int32_t, float, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, float, + LayoutDst, float, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + 
cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAdd>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_id.cu new file mode 100644 index 00000000..7e4273a7 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCHW; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< + float, 1, int32_t, float, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, float, + LayoutDst, float, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAdd>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_relu.cu new file mode 100644 index 00000000..7fa35c72 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include 
"src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCHW; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, 1, int32_t, float, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, float, + LayoutDst, float, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAdd>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_hswish.cu new file mode 100644 index 00000000..ec78da97 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCHW; +using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, 1, int32_t, float, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, float, + LayoutDst, float, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 4, true, + cutlass::arch::OpMultiplyAdd>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + 
int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_id.cu new file mode 100644 index 00000000..27df4ce8 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCHW; +using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< + float, 1, int32_t, float, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, float, + LayoutDst, float, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 4, true, + cutlass::arch::OpMultiplyAdd>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_relu.cu new file mode 100644 index 00000000..b08e9563 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCHW; +using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = 
cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, 1, int32_t, float, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, float, + LayoutDst, float, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 4, true, + cutlass::arch::OpMultiplyAdd>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_hswish.cu new file mode 100644 index 00000000..88d5dad6 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_hswish.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCHW; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< + float, 1, int32_t, float, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, float, + LayoutDst, float, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAdd>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_id.cu 
b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_id.cu new file mode 100644 index 00000000..021eee46 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_id.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCHW; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< + float, 1, int32_t, float, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, float, + LayoutDst, float, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, false, + cutlass::arch::OpMultiplyAdd>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_relu.cu new file mode 100644 index 00000000..540f13ec --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCHW; +using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, 1, int32_t, float, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, float, + LayoutDst, float, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, 
cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_hswish.cu
new file mode 100644
index 00000000..d9fa27c5
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_hswish.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_id.cu
new file mode 100644
index 00000000..bd9b9db7
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_id.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_relu.cu
new file mode 100644
index 00000000..3581ac6b
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_relu.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<128, 32, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_hswish.cu
new file mode 100644
index 00000000..2a13f48c
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_hswish.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_id.cu
new file mode 100644
index 00000000..33a1b07f
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_id.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_relu.cu
new file mode 100644
index 00000000..2e86a099
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_relu.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_hswish.cu
new file mode 100644
index 00000000..56c22fc6
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_hswish.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 4, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_id.cu
new file mode 100644
index 00000000..68a8a13b
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_id.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 4, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_relu.cu
new file mode 100644
index 00000000..8ca682f4
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_relu.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>;
+using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 4, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_hswish.cu
new file mode 100644
index 00000000..b6923f4a
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_hswish.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_id.cu
new file mode 100644
index 00000000..75acf999
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_id.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_relu.cu
new file mode 100644
index 00000000..7248da5a
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_relu.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_hswish.cu
new file mode 100644
index 00000000..b8c1e278
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_hswish.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>;
+using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_id.cu
new file mode 100644
index 00000000..393b5a6c
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_id.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>;
+using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_relu.cu
new file mode 100644
index 00000000..c1ae6410
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_relu.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>;
+using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_hswish.cu
new file mode 100644
index 00000000..2d327f97
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_hswish.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_id.cu
new file mode 100644
index 00000000..33332c77
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_id.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_relu.cu
new file mode 100644
index 00000000..37369d3e
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_relu.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_hswish.cu
new file mode 100644
index 00000000..5ffd599e
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_hswish.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_id.cu
new file mode 100644
index 00000000..2fea2ac0
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_id.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_relu.cu
new file mode 100644
index 00000000..883b755d
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_relu.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_hswish.cu
new file mode 100644
index 00000000..3cfe915f
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_hswish.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_id.cu
new file mode 100644
index 00000000..6b711dc1
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_id.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_relu.cu
new file mode 100644
index 00000000..c6340646
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_relu.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_hswish.cu
new file mode 100644
index 00000000..e1a4fac1
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_hswish.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_id.cu
new file mode 100644
index 00000000..8b2c0d5f
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_id.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_relu.cu
new file mode 100644
index 00000000..eeb3c5c2
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_relu.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_hswish.cu
new file mode 100644
index 00000000..b04889b8
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_hswish.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_id.cu
new file mode 100644
index 00000000..03d817af
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_id.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_relu.cu
new file mode 100644
index 00000000..86009742
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_relu.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_hswish.cu
new file mode 100644
index 00000000..22a9bfb1
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_hswish.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>;
+using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_id.cu
new file mode 100644
index 00000000..e1e20011
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_id.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>;
+using WarpShape =
cutlass::gemm::GemmShape<32, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< + float, 1, int32_t, float, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, float, + LayoutDst, float, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAdd>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_relu.cu new file mode 100644 index 00000000..d9b963e0 --- /dev/null +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_relu.cu @@ -0,0 +1,37 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorCxRSKx<4>; +using LayoutDst = cutlass::layout::TensorNCHW; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 32, 32>; +using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< + float, 1, int32_t, float, float>; +using Convolution = cutlass::convolution::device::Convolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, float, + LayoutDst, float, LayoutDst, int32_t, + cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< + cutlass::convolution::ConvType::kConvolution>, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAdd>; +template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, + int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git 
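Every generated file above ends in the same explicit instantiation of cutlass_convolution_wrapper<Convolution>; only the Convolution alias (tile shapes plus epilogue) changes between files. As a minimal, self-contained model of what that last line does — the names wrapper and ConvA are illustrative, not from this patch — each .cu forces the compiler to emit one concrete specialization, so every tile/epilogue combination lands in its own translation unit and callers only ever need a declaration:

// model.cu -- sketch of the explicit-instantiation pattern used by the
// generated kernels; illustrative names, not part of the patch.
#include <cstdint>

template <typename Convolution>
void wrapper(const typename Convolution::ElementSrc* d_src) {
    // a real wrapper would configure and launch the CUTLASS kernel here
    (void)d_src;
}

struct ConvA {  // stands in for one cutlass::convolution::device::Convolution<...>
    using ElementSrc = int8_t;
};

// Explicit instantiation definition: emits wrapper<ConvA> in this .cu, exactly
// like the `template void ... cutlass_convolution_wrapper<Convolution>(...)`
// lines above; other files link against it with only a declaration visible.
template void wrapper<ConvA>(const ConvA::ElementSrc*);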
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_hswish.cu
new file mode 100644
index 00000000..4ad86a3f
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_hswish.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_id.cu
new file mode 100644
index 00000000..6f7648c3
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_id.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_relu.cu
new file mode 100644
index 00000000..f3f8802b
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_relu.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 32>;
+using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_hswish.cu
new file mode 100644
index 00000000..497591db
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_hswish.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_id.cu
new file mode 100644
index 00000000..b06de5af
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_id.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_relu.cu
new file mode 100644
index 00000000..28aa9131
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_relu.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_hswish.cu
new file mode 100644
index 00000000..87e5aae0
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_hswish.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_id.cu
new file mode 100644
index 00000000..0f5d5d87
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_id.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_relu.cu
new file mode 100644
index 00000000..daadee35
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_relu.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_hswish.cu
new file mode 100644
index 00000000..6f19370e
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_hswish.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_id.cu
new file mode 100644
index 00000000..ca6f680b
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_id.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_relu.cu
new file mode 100644
index 00000000..b7eccd8c
--- /dev/null
+++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_relu.cu
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 32>;
+using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        2, 4, 16, true,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
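That completes the NCHW-output dp4a family: four threadblock/warp tilings times three epilogues, plus the 1x1 variants with the const-memory flag cleared. A host-side sketch of how one of these instantiations is reached through the entry point declared in cutlass_convolution_wrapper.cuh; all numeric values are placeholders and the device buffers and ConvParam setup are assumed, not shown:

#include "src/cuda/conv_bias/cutlass_convolution_wrapper.cuh"

using megdnn::cuda::cutlass_wrapper::GemmCoord;

void launch_example(const int8_t* d_src, const int8_t* d_filter,
                    const float* d_bias, float* d_dst, int* workspace,
                    const megdnn::cuda::convolution::ConvParam& param,
                    cudaStream_t stream) {
    // Selects conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_relu.cu:
    // threadblock 64x64x32, warp 64x32x32, RELU epilogue, NeedLoadFromConstMem=false.
    GemmCoord threadblock_shape{64, 64, 32};
    GemmCoord warp_shape{64, 32, 32};
    megdnn::cuda::cutlass_wrapper::
            do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw<false>(
                    d_src, d_filter, d_bias, /*d_z=*/nullptr, d_dst, workspace,
                    param,
                    static_cast<uint32_t>(
                            megdnn::param_enumv::ConvBias::NonlineMode::RELU),
                    /*alpha=*/1.f, /*beta=*/1.f, /*gamma=*/0.f, /*scale=*/1.f,
                    threadblock_shape, warp_shape, stream);
}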
diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_hswish.cu
index de60503a..42029b8a 100644
--- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_hswish.cu
@@ -8,6 +8,7 @@
 
 using LayoutSrc = cutlass::layout::TensorNCxHWx<32>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<32>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>;
 using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
 using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClam
         int8_t, 8, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 16, 16, true>;
+        2, 16, 16, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_id.cu
index d983bd9f..9de29a4a 100644
--- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_id.cu
+++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_id.cu
@@ -8,6 +8,7 @@
 
 using LayoutSrc = cutlass::layout::TensorNCxHWx<32>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<32>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>;
 using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
 using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
         int8_t, 8, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 16, 16, true>;
+        2, 16, 16, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_relu.cu
index f18a6ddb..a91bd76d 100644
--- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_relu.cu
@@ -8,6 +8,7 @@
 
 using LayoutSrc = cutlass::layout::TensorNCxHWx<32>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<32>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>;
 using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
 using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
         int8_t, 8, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 16, 16, true>;
+        2, 16, 16, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_hswish.cu
index 6326c4c3..a156009f 100644
--- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_hswish.cu
@@ -8,6 +8,7 @@
 
 using LayoutSrc = cutlass::layout::TensorNCxHWx<32>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<32>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>;
 using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
 using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClam
         int8_t, 8, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 16, 16, true>;
+        2, 16, 16, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_id.cu
index 03827f1f..645fb0f6 100644
--- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_id.cu
+++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_id.cu
@@ -8,6 +8,7 @@
 
 using LayoutSrc = cutlass::layout::TensorNCxHWx<32>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<32>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>;
 using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
 using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
         int8_t, 8, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 16, 16, true>;
+        2, 16, 16, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_relu.cu
index 2eb13965..b6451a72 100644
--- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_relu.cu
@@ -8,6 +8,7 @@
 
 using LayoutSrc = cutlass::layout::TensorNCxHWx<32>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<32>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>;
 using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>;
 using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
         int8_t, 8, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 16, 16, true>;
+        2, 16, 16, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_hswish.cu
index 6eaaa06c..0d5dc8c4 100644
--- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_hswish.cu
@@ -8,6 +8,7 @@
 
 using LayoutSrc = cutlass::layout::TensorNCxHWx<32>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<32>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 64>;
 using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>;
 using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClam
         int8_t, 8, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 16, 16, true>;
+        2, 16, 16, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_id.cu
index 622b4076..dd1e9c2e 100644
--- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_id.cu
+++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_id.cu
@@ -8,6 +8,7 @@
 
 using LayoutSrc = cutlass::layout::TensorNCxHWx<32>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<32>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 64>;
 using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>;
 using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
         int8_t, 8, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 16, 16, true>;
+        2, 16, 16, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_relu.cu
index 138b2448..96f54e5d 100644
--- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_relu.cu
@@ -8,6 +8,7 @@
 
 using LayoutSrc = cutlass::layout::TensorNCxHWx<32>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<32>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 64>;
 using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>;
 using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
         int8_t, 8, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 16, 16, true>;
+        2, 16, 16, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
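The 1x1 imma kernels below receive the same edits, with the trailing flag false instead of true. The switch from concrete int8_t/int32_t pointers to the Convolution::Element* typedefs is what lets one instantiation text serve both kernel families in this patch: the imma NCxHWx<32> kernels write int8_t output with int32_t bias, while the new dp4a NCHW kernels write float output with float bias. A small compile-time sketch of that resolution, assuming only the typedefs that the instantiations above already rely on (ImmaConv/Dp4aConv are illustrative stand-ins):

#include <type_traits>

// Illustrative check only; ImmaConv and Dp4aConv stand for the Convolution
// aliases defined in the imma and dp4a files of this patch.
template <typename Conv, typename Dst, typename Bias>
constexpr bool element_types_are() {
    return std::is_same<typename Conv::ElementDst, Dst>::value &&
           std::is_same<typename Conv::ElementBias, Bias>::value;
}

// With the definitions above, these would hold:
//   static_assert(element_types_are<ImmaConv, int8_t, int32_t>(), "");
//   static_assert(element_types_are<Dp4aConv, float, float>(), "");
// so `typename Convolution::ElementDst* d_dst` expands to `int8_t*` in one
// family and `float*` in the other without changing the instantiation text.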
100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>; using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClam int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, false>; + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_id.cu index a6c7e14d..b576c796 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>; using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, false>; + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( - const int8_t* d_src, - 
const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_relu.cu index ec643d8f..7d6d5092 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<128, 128, 64>; using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, false>; + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_hswish.cu index f9e4e4ee..b504399a 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>; using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using
EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, false>; + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_id.cu index 38cdbac7..e1f9eaa2 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>; using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, false>; + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_relu.cu 
b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_relu.cu index 110aad93..03f90194 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<128, 256, 64>; using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, false>; + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_hswish.cu index 7cc97c69..006f790a 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 64>; using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, false>; + 2, 
16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_id.cu index da6d857c..0906068e 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, false>; + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_relu.cu index 7f8ef9cf..adcbfbea 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<128, 64, 64>; using WarpShape = 
cutlass::gemm::GemmShape<64, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, false>; + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_hswish.cu index adbb4fee..31bac99f 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, false>; + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_id.cu index 04e2a5b9..d35a2be8 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, false>; + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_relu.cu index 85335275..272c7249 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, 
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, false>; + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_hswish.cu index a52c6a23..7315cecb 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, false>; + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_id.cu index b017b288..9d4b8466 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = 
cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, false>; + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_relu.cu index fb982a32..6a87dc00 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, false>; + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, 
typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_hswish.cu index 1599ee01..f234109f 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, false>; + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_id.cu index 25e96942..8846c82f 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, 
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, false>; + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_relu.cu index 5fbd49e1..54a30792 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, false>; + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_hswish.cu index 3aab172c..df3571a2 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_hswish.cu +++ 
b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, false>; + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_id.cu index 8db95857..deba71bb 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, false>; + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename 
Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_relu.cu index ccf7790f..2392dc7b 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>; using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, false>; + 2, 16, 16, false, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_hswish.cu index 6d37a375..fe0b5849 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, 
LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, true>; + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_id.cu index 1009ca3b..61ac1b49 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, true>; + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_relu.cu index 445b6529..4ea0b892 100644 --- 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<256, 128, 64>; using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, true>; + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_hswish.cu index 0299ac32..ffef0da2 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, true>; + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const 
int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_id.cu index 9cc576d2..aa544e2d 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, true>; + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_relu.cu index 7e528493..762bf3e9 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<32, 64, 64>; using WarpShape = cutlass::gemm::GemmShape<32, 16, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, 
float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, true>; + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_hswish.cu index ec9f4d36..bad9a5ce 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_hswish.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, true>; + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_id.cu index e3d3e9eb..b08d7a95 100644 --- 
a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_id.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, true>; + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, - const int8_t* d_z, - int8_t* d_dst, + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, typename Convolution::ConvolutionParameter const& conv_param, typename Convolution::EpilogueOutputOp::Params const& epilogue, diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_relu.cu index 6d47926c..9173c904 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_relu.cu @@ -8,6 +8,7 @@ using LayoutSrc = cutlass::layout::TensorNCxHWx<32>; using LayoutFilter = cutlass::layout::TensorCxRSKx<32>; +using LayoutDst = cutlass::layout::TensorNCxHWx<32>; using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 64>; using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; @@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, float>; using Convolution = cutlass::convolution::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, - LayoutSrc, int32_t, LayoutSrc, int32_t, + LayoutDst, int32_t, LayoutDst, int32_t, cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< cutlass::convolution::ConvType::kConvolution>, - 2, 16, 16, true>; + 2, 16, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( - const int8_t* d_src, - const int8_t* d_filter, - const int32_t* d_bias, 
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_hswish.cu
index deea9c4c..ceb38356 100644
--- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_hswish.cu
+++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_hswish.cu
@@ -8,6 +8,7 @@
 using LayoutSrc = cutlass::layout::TensorNCxHWx<32>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<32>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>;
 using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>;
 using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
@@ -15,18 +16,19 @@
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp<
         int8_t, 8, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution,
         cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 16, 16, true>;
+        2, 16, 16, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
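
Each generated kimpl file above receives the same three edits: the destination layout gets its own LayoutDst alias (identical to LayoutSrc for these NCHW32 kernels, but free to differ once NCHW-output kernels exist), the Convolution template is pinned explicitly to cutlass::arch::OpMultiplyAddSaturate, and the explicit instantiation spells its pointer parameters through the kernel's element typedefs instead of hard-coded int8_t/int32_t. The typedef-based signature is what lets a single wrapper template serve both the int8-output kernels here and float-output kernels. A minimal standalone sketch of that mechanism (the Conv stand-in types below are hypothetical, not the real CUTLASS interface):

    #include <cstdint>

    // Hypothetical stand-ins for two kernel configurations; the real code
    // uses cutlass::convolution::device::Convolution specializations.
    struct ConvInt8Dst {
        using ElementSrc = int8_t;
        using ElementBias = int32_t;
        using ElementDst = int8_t;   // int8-output path (e.g. NCHW32)
    };

    struct ConvFloatDst {
        using ElementSrc = int8_t;
        using ElementBias = float;
        using ElementDst = float;    // float-output path (e.g. NCHW4_NCHW)
    };

    // One wrapper template; parameter types track each kernel's typedefs.
    template <typename Conv>
    void wrapper(const typename Conv::ElementSrc* /*src*/,
                 const typename Conv::ElementBias* /*bias*/,
                 typename Conv::ElementDst* /*dst*/) {
        // launch Conv's kernel here
    }

    // Both explicit instantiations come from the same template; only the
    // element typedefs differ, so no per-dtype overloads are needed.
    template void wrapper<ConvInt8Dst>(const int8_t*, const int32_t*, int8_t*);
    template void wrapper<ConvFloatDst>(const int8_t*, const float*, float*);
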
diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_id.cu
index 6bb39d14..7f1b55c0 100644
--- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_id.cu
+++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_id.cu
@@ -8,6 +8,7 @@
 using LayoutSrc = cutlass::layout::TensorNCxHWx<32>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<32>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>;
 using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>;
 using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
@@ -15,18 +16,19 @@
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
         int8_t, 8, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution,
         cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 16, 16, true>;
+        2, 16, 16, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_relu.cu
index d7ce6666..c599f7ac 100644
--- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_relu.cu
+++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_relu.cu
@@ -8,6 +8,7 @@
 using LayoutSrc = cutlass::layout::TensorNCxHWx<32>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<32>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<32>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<64, 64, 64>;
 using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>;
 using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>;
@@ -15,18 +16,19 @@
 using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
         int8_t, 8, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution,
         cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        2, 16, 16, true>;
+        2, 16, 16, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
diff --git a/dnn/test/cuda/conv_bias_int8.cpp b/dnn/test/cuda/conv_bias_int8.cpp
index 693da6ca..41592a08 100644
--- a/dnn/test/cuda/conv_bias_int8.cpp
+++ b/dnn/test/cuda/conv_bias_int8.cpp
@@ -1191,6 +1191,47 @@ TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW32_IMMA) {
 }
 #endif
 
+TEST_F(CUDA, CUTLASS_CONV_BIAS_INT8_NCHW4_NCHW) {
+    require_compute_capability(6, 1);
+    using namespace conv_bias;
+    Checker<ConvBiasForward> checker(handle_cuda());
+    UniformIntRNG int_rng{-3, 3};
+    UniformFloatRNG float_rng{-50, 50};
+    ConvBias::Param param;
+    param.format = ConvBias::Param::Format::NCHW4_NCHW;
+    param.nonlineMode = ConvBias::Param::NonlineMode::IDENTITY;
+    checker.set_before_exec_callback(
+            conv_bias::ConvBiasAlgoChecker<ConvBiasForward>(
+                    "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM"));
+    checker.set_dtype(0, dtype::QuantizedS8(1.9980618f))
+            .set_dtype(1, dtype::QuantizedS8(1.9980927f))
+            .set_dtype(2, dtype::Float32())
+            .set_dtype(3, dtype::Float32())
+            .set_dtype(4, dtype::Float32())
+            .set_rng(0, &int_rng)
+            .set_rng(1, &int_rng)
+            .set_rng(2, &float_rng)
+            .set_rng(3, &float_rng)
+            .set_param(param);
+
+    auto opr = handle_cuda()->create_operator<ConvBias>();
+
+    auto run = [&](const TensorShapeArray& shapes) {
+        opr->param() = param;
+        TensorLayout dst_layout;
+        opr->deduce_layout({shapes[0], dtype::Float32()},
+                           {shapes[1], dtype::Float32()}, {}, {}, dst_layout);
+        checker.execs({shapes[0], shapes[1], shapes[2], dst_layout, {}});
+    };
+
+    run({{16, 4, 23, 40, 4}, {20, 4, 3, 3, 4}, {1, 20, 1, 1}});
+    run({{16, 4, 92, 160, 4}, {24, 4, 3, 3, 4}, {1, 24, 1, 1}});
+    run({{16, 4, 92, 160, 4}, {20, 4, 3, 3, 4}, {1, 20, 1, 1}});
+    run({{16, 4, 92, 160, 4}, {16, 4, 3, 3, 4}, {1, 16, 1, 1}});
+    run({{16, 4, 92, 160, 4}, {8, 4, 3, 3, 4}, {1, 8, 1, 1}});
+    run({{16, 4, 46, 80, 4}, {4, 4, 3, 3, 4}, {1, 4, 1, 1}});
+}
+
 #if MEGDNN_WITH_BENCHMARK
 TEST_F(CUDA, BENCHMARK_CONV_BIAS_INT8_CHWN4) {
     require_compute_capability(6, 1);
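
In the test above, src is an int8 NCHW4 tensor {N, C/4, H, W, 4}, the filter is {Co, Ci/4, Fh, Fw, 4}, and bias/z/dst are plain float NCHW tensors, with the dst shape supplied by deduce_layout. The shapes it produces can be checked by hand. A small standalone sketch of the arithmetic (assuming the test's ConvBias::Param defaults of padding 0 and stride 1; deduce_nchw_dst is a hypothetical helper, not a megdnn API):

    #include <cassert>
    #include <cstdio>

    struct Shape { int n, c, h, w; };

    // src is NCHW4: {N, C/4, H, W, 4}; filter is {Co, Ci/4, Fh, Fw, 4}.
    Shape deduce_nchw_dst(int n, int c_div4, int h, int w,
                          int co, int fh, int fw,
                          int pad = 0, int stride = 1) {
        (void)c_div4;  // channels affect the accumulation, not the dst shape
        int ho = (h + 2 * pad - fh) / stride + 1;
        int wo = (w + 2 * pad - fw) / stride + 1;
        return {n, co, ho, wo};  // dst is plain NCHW float
    }

    int main() {
        // First case from the test: src {16, 4, 23, 40, 4}, filter {20, 4, 3, 3, 4}
        Shape d = deduce_nchw_dst(16, 4, 23, 40, 20, 3, 3);
        assert(d.n == 16 && d.c == 20 && d.h == 21 && d.w == 38);
        std::printf("dst: {%d, %d, %d, %d}\n", d.n, d.c, d.h, d.w);
    }
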