diff --git a/.gitattributes b/.gitattributes index be56319f..07945c65 100644 --- a/.gitattributes +++ b/.gitattributes @@ -5,6 +5,7 @@ dnn/src/cuda/conv_bias/int8_imma/kimpl/* binary dnn/src/cuda/batch_conv_bias/int8/kimpl/* binary dnn/src/cuda/matrix_mul/fp32_simt/kimpl/* binary dnn/src/cuda/sass/prebuilt/map_defs.cpp binary +dnn/src/cuda/convolution/backward_data/int8/kimpl/* binary tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text *.caffemodel filter=lfs diff=lfs merge=lfs -text imperative/python/test/integration/data/*.mge filter=lfs diff=lfs merge=lfs -text diff --git a/dnn/src/common/convolution.cpp b/dnn/src/common/convolution.cpp index e2c7aa91..b23f4f83 100644 --- a/dnn/src/common/convolution.cpp +++ b/dnn/src/common/convolution.cpp @@ -46,7 +46,7 @@ void make_canonized_filter_meta_nchw_nhwc( size_t src_ndim, const TensorLayout& filter, const Param& param, typename ConvolutionBase::CanonizedFilterMeta& ret) { megdnn_assert(param.format == Param::Format::NCHW || - param.format == Param::Format::NHWC ); + param.format == Param::Format::NHWC); auto img_ndim = src_ndim - 2; size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; if (param.sparse == Param::Sparse::DENSE) { @@ -320,8 +320,8 @@ void make_canonized_filter_meta_nchwxx( img_ndim, filter.ndim); megdnn_assert((filter[filter.ndim - 1] == pack_size && filter[filter.ndim - 2] == pack_size) || - (filter[filter.ndim - 1] == 2 * pack_size && - filter[filter.ndim - 2] == 2 * pack_size), + (filter[filter.ndim - 1] == 2 * pack_size && + filter[filter.ndim - 2] == 2 * pack_size), "last 2 dim of filter must be %zu, but got %zu, %zu", pack_size, filter[filter.ndim - 2], filter[filter.ndim - 1]); @@ -684,7 +684,8 @@ ConvolutionBase::deduce_layout_fwd(const TensorLayout& src, } if (param().format == Param::Format::NCHW44 || param().format == Param::Format::NCHW44_DOT) { - //!support nchw44 filter change to 88 for int8 winogradf23_88 using MK8 mamtul + //! support nchw44 filter change to 88 for int8 winogradf23_88 using + //! MK8 mamtul megdnn_assert((src.ndim == 4 && filter.ndim == 5 && filter[filter.ndim - 1] == 4) || (src.ndim == 5 && @@ -716,7 +717,7 @@ ConvolutionBase::deduce_layout_fwd(const TensorLayout& src, "currently only convolution on 2D image is supported"); auto cflt = make_canonized_filter_meta(src.ndim, filter); if (param().format == Param::Format::NCHW || - param().format == Param::Format::NHWC ) { + param().format == Param::Format::NHWC) { size_t src_or_dst_c_pos = 0; size_t src_or_dst_spatial_start = 0; if (param().format == Param::Format::NCHW) { @@ -790,7 +791,7 @@ ConvolutionBase::deduce_layout_fwd(const TensorLayout& src, dst[3] = infer_conv_shape(src[3], cflt.dilated_spatial[1], cflt.stride[1], cflt.padding[1]); dst[4] = 32; - } else if (param().format == Param::Format::NCHW88 ) { + } else if (param().format == Param::Format::NCHW88) { megdnn_assert(src.ndim == 5 || (src.ndim == 4 && src[1] <= 8), "invalid src ndim for NCHW88, expected=5 or 4, got=%zu", src.ndim); @@ -1042,10 +1043,10 @@ void ConvolutionBackwardData::deduce_dtype(DType filter, DType diff, } megdnn_assert(param().compute_mode != Param::ComputeMode::FLOAT32 #if !MEGDNN_DISABLE_FLOAT16 - || filter.enumv() == DTypeEnum::Float16 - || filter.enumv() == DTypeEnum::BFloat16 + || filter.enumv() == DTypeEnum::Float16 || + filter.enumv() == DTypeEnum::BFloat16 #endif - , + , "ComputeMode::FLOAT32 is only available for Float16/BFloat16 " "input / output."); } @@ -1096,6 +1097,24 @@ void ConvolutionBackwardData::deduce_layout(const TensorLayout& filter, diff[i + src_or_dst_spatial_start], cflt.dilated_spatial[i], cflt.stride[i], cflt.padding[i]); } + } else if (param().format == Param::Format::NCHW4) { + megdnn_assert(diff.ndim == 5, + "valid diff ndim for NCHW4, expected=5, got=%zu", + diff.ndim); + megdnn_assert(cflt.group == 1, "%s", errmsg().c_str()); + megdnn_assert(cflt.ocpg * cflt.group == diff[1] * 4, "%s", + errmsg().c_str()); + grad.ndim = diff.ndim; + grad[0] = diff[0]; + auto ic = cflt.icpg * cflt.group; + megdnn_assert(ic % 4 == 0); + grad[1] = ic / 4; + grad[2] = deduce(diff[2], cflt.dilated_spatial[0], cflt.stride[0], + cflt.padding[0]); + grad[3] = deduce(diff[3], cflt.dilated_spatial[1], cflt.stride[1], + cflt.padding[1]); + megdnn_assert(diff[4] == 4); + grad[4] = 4; } else { megdnn_assert(param().format == Param::Format::NHWCD4); megdnn_assert(diff.ndim == 5, diff --git a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu index 667b5771..61921a21 100644 --- a/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu +++ b/dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu @@ -62,22 +62,21 @@ void megdnn::cuda::cutlass_wrapper:: threadblock_k_>; \ using WarpShape = cutlass::gemm::GemmShape; \ using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; \ - using Convolution = cutlass::convolution::device::Convolution< \ + using Convolution = cutlass::conv::device::Convolution< \ int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t, \ cutlass::layout::TensorCxRSKx<32>, ElementOutput, \ cutlass::layout::TensorNCxHWx<32>, int32_t, \ cutlass::layout::TensorNCxHWx<32>, int32_t, \ - cutlass::convolution::ConvType::kConvolution, \ + cutlass::conv::ConvType::kConvolution, \ cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \ ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ - cutlass::convolution::threadblock:: \ - ConvolutionNCxHWxThreadblockSwizzle< \ - cutlass::convolution::ConvType::kConvolution>, \ + cutlass::conv::threadblock:: \ + ConvolutionFpropNCxHWxThreadblockSwizzle, \ 2, 16, 16, NeedLoadFromConstMem>; \ - typename Convolution::ConvolutionParameter conv_param{ \ - param.n, param.ci, param.co, param.hi, param.wi, \ - param.fh, param.fw, param.ho, param.wo, param.sh, \ - param.sw, param.ph, param.pw, 1, 1}; \ + typename Convolution::ConvolutionParameter conv_param( \ + param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ + param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ + param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ return cutlass_convolution_wrapper( \ d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ epilogue, stream); \ @@ -186,22 +185,21 @@ void megdnn::cuda::cutlass_wrapper:: threadblock_k_>; \ using WarpShape = cutlass::gemm::GemmShape; \ using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; \ - using Convolution = cutlass::convolution::device::Convolution< \ + using Convolution = cutlass::conv::device::Convolution< \ int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t, \ cutlass::layout::TensorCxRSKx<32>, ElementOutput, \ cutlass::layout::TensorNCxHWx<4>, int32_t, \ cutlass::layout::TensorNCxHWx<4>, int32_t, \ - cutlass::convolution::ConvType::kConvolution, \ + cutlass::conv::ConvType::kConvolution, \ cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \ ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ - cutlass::convolution::threadblock:: \ - ConvolutionNCxHWxThreadblockSwizzle< \ - cutlass::convolution::ConvType::kConvolution>, \ + cutlass::conv::threadblock:: \ + ConvolutionFpropNCxHWxThreadblockSwizzle, \ 2, 16, 16, NeedLoadFromConstMem>; \ - typename Convolution::ConvolutionParameter conv_param{ \ - param.n, param.ci, param.co, param.hi, param.wi, \ - param.fh, param.fw, param.ho, param.wo, param.sh, \ - param.sw, param.ph, param.pw, 1, 1}; \ + typename Convolution::ConvolutionParameter conv_param( \ + param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ + param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ + param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ return cutlass_convolution_wrapper( \ d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ epilogue, stream); \ @@ -311,22 +309,21 @@ void megdnn::cuda::cutlass_wrapper:: threadblock_k_>; \ using WarpShape = cutlass::gemm::GemmShape; \ using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ - using Convolution = cutlass::convolution::device::Convolution< \ + using Convolution = cutlass::conv::device::Convolution< \ int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ cutlass::layout::TensorNCxHWx<4>, int32_t, \ cutlass::layout::TensorNCxHWx<4>, int32_t, \ - cutlass::convolution::ConvType::kConvolution, \ + cutlass::conv::ConvType::kConvolution, \ cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ - cutlass::convolution::threadblock:: \ - ConvolutionNCxHWxThreadblockSwizzle< \ - cutlass::convolution::ConvType::kConvolution>, \ + cutlass::conv::threadblock:: \ + ConvolutionFpropNCxHWxThreadblockSwizzle, \ stage_, 4, aligned_, NeedLoadFromConstMem>; \ - typename Convolution::ConvolutionParameter conv_param{ \ - param.n, param.ci, param.co, param.hi, param.wi, \ - param.fh, param.fw, param.ho, param.wo, param.sh, \ - param.sw, param.ph, param.pw, 1, 1}; \ + typename Convolution::ConvolutionParameter conv_param( \ + param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ + param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ + param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ return cutlass_convolution_wrapper( \ d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ epilogue, stream); \ @@ -441,23 +438,22 @@ void megdnn::cuda::cutlass_wrapper:: threadblock_k_>; \ using WarpShape = cutlass::gemm::GemmShape; \ using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ - using Convolution = cutlass::convolution::device::Convolution< \ + using Convolution = cutlass::conv::device::Convolution< \ int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ cutlass::layout::TensorNCHW, float, \ cutlass::layout::TensorNCHW, int32_t, \ - cutlass::convolution::ConvType::kConvolution, \ + cutlass::conv::ConvType::kConvolution, \ cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ - cutlass::convolution::threadblock:: \ - ConvolutionNCxHWxThreadblockSwizzle< \ - cutlass::convolution::ConvType::kConvolution>, \ + cutlass::conv::threadblock:: \ + ConvolutionFpropNCxHWxThreadblockSwizzle, \ stages_, 4, aligned_, NeedLoadFromConstMem, \ cutlass::arch::OpMultiplyAdd>; \ - typename Convolution::ConvolutionParameter conv_param{ \ - param.n, param.ci, param.co, param.hi, param.wi, \ - param.fh, param.fw, param.ho, param.wo, param.sh, \ - param.sw, param.ph, param.pw, 1, 1}; \ + typename Convolution::ConvolutionParameter conv_param( \ + param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ + param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ + param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ return cutlass_convolution_wrapper( \ d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ epilogue, stream); \ @@ -572,36 +568,35 @@ void megdnn::cuda::cutlass_wrapper:: threadblock_k_>; \ using WarpShape = cutlass::gemm::GemmShape; \ using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ - using Convolution = cutlass::convolution::device::Convolution< \ + using Convolution = cutlass::conv::device::Convolution< \ int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ cutlass::layout::TensorNCxHWx<32>, int32_t, \ cutlass::layout::TensorNCxHWx<32>, int32_t, \ - cutlass::convolution::ConvType::kConvolution, \ + cutlass::conv::ConvType::kConvolution, \ cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ - cutlass::convolution::threadblock:: \ - ConvolutionNCxHWxThreadblockSwizzle< \ - cutlass::convolution::ConvType::kConvolution>, \ + cutlass::conv::threadblock:: \ + ConvolutionFpropNCxHWxThreadblockSwizzle, \ stages_, 4, aligned_, NeedLoadFromConstMem>; \ - typename Convolution::ConvolutionParameter conv_param{ \ - param.n, param.ci, param.co, param.hi, param.wi, \ - param.fh, param.fw, param.ho, param.wo, param.sh, \ - param.sw, param.ph, param.pw, 1, 1}; \ + typename Convolution::ConvolutionParameter conv_param( \ + param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ + param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ + param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ return cutlass_convolution_wrapper( \ d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ epilogue, stream); \ } #define DISPATCH_KERNEL \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \ - DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \ megdnn_assert(false, \ "unsupported threadblock shape (%dx%dx%d) and warp shape " \ "(%dx%dx%d)", \ diff --git a/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl b/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl index 31b9c40b..cf20f616 100644 --- a/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl +++ b/dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl @@ -29,28 +29,30 @@ void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( cudaStream_t stream) { typename Convolution::TensorRefSrc tensor_src{ const_cast(d_src), - Convolution::LayoutSrc::packed({conv_param.n(), conv_param.hi(), - conv_param.wi(), conv_param.ci()})}; + Convolution::LayoutSrc::packed( + {conv_param.N, conv_param.H, conv_param.W, conv_param.C})}; typename Convolution::TensorRefFilter tensor_filter{ const_cast(d_filter), - Convolution::LayoutFilter::packed({conv_param.co(), conv_param.fh(), - conv_param.fw(), - conv_param.ci()})}; + Convolution::LayoutFilter::packed( + {conv_param.K, conv_param.R, conv_param.S, conv_param.C})}; typename Convolution::TensorRefBias tensor_bias{ const_cast(d_bias), - Convolution::LayoutBias::packed({1, 1, 1, conv_param.co()})}; + Convolution::LayoutBias::packed({1, 1, 1, conv_param.K})}; typename Convolution::TensorRefDst tensor_z{ const_cast(d_z), - Convolution::LayoutDst::packed({conv_param.n(), conv_param.ho(), - conv_param.wo(), conv_param.co()})}; + Convolution::LayoutDst::packed( + {conv_param.N, conv_param.P, conv_param.Q, conv_param.K})}; typename Convolution::TensorRefDst tensor_dst{ d_dst, - Convolution::LayoutDst::packed({conv_param.n(), conv_param.ho(), - conv_param.wo(), conv_param.co()})}; - typename Convolution::Arguments arguments{ - conv_param, tensor_src, tensor_filter, - tensor_bias, tensor_z, tensor_dst.non_const_ref(), - epilogue}; + Convolution::LayoutDst::packed( + {conv_param.N, conv_param.P, conv_param.Q, conv_param.K})}; + typename Convolution::Arguments arguments{conv_param, + tensor_src.non_const_ref(), + tensor_filter.non_const_ref(), + tensor_bias.non_const_ref(), + tensor_z.non_const_ref(), + tensor_dst.non_const_ref(), + epilogue}; Convolution conv_op; cutlass_check(conv_op.initialize(arguments, workspace)); cutlass_check(conv_op(stream)); diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_hswish.cu index 4f0eebb7..aede9980 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_id.cu index b92f7681..57f38b13 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_relu.cu index e47ab052..a73b75fa 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x128x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_hswish.cu index 30eec136..4121d19a 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_id.cu index 1307f937..ebc98a40 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_relu.cu index 1e095d92..c70ca160 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x32x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_hswish.cu index dbf1e900..9fbad0d3 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_id.cu index 1ef223f9..9fd33cba 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_relu.cu index 65a5fcc7..8d5cc67a 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_128x64x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu index 0c779fe1..820c9757 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 1, 4, 8, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu index 85fdecd7..f3d1a463 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 1, 4, 8, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu index c088e6f6..7f35a20c 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 1, 4, 8, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_hswish.cu index 931f3716..a952578d 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 4, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu index 70949c65..da9f4985 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 4, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_relu.cu index 0aaa3f7a..065b7042 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 4, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_hswish.cu index dc198d34..2b3a319b 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_id.cu index e20f26dc..187f7b96 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_relu.cu index f9e8b712..712822ff 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x128x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_hswish.cu index ade39f7a..4d7d9337 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_id.cu index e07c6dcf..aeafaf15 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_relu.cu index 27aa11b5..9d41256d 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x32x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_hswish.cu index 3e38529b..66a31a7f 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_id.cu index 1b3145c4..ca0cf04c 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_relu.cu index 962dd2ff..f871dae4 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_128x64x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu index 14226ecd..725fed7f 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 1, 4, 8, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu index 449dee42..e7688245 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 1, 4, 8, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu index 0ed74669..100c4861 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 1, 4, 8, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_hswish.cu index 4d3feb12..aa5ecf00 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 4, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_id.cu index 1dbfbf60..0a1c1d3a 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 4, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_relu.cu index d4fcfb06..a61a466a 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x64x8_16x64x8_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 4, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_hswish.cu index ab979349..8ac06afb 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_id.cu index 4d9abe29..7237f2de 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_relu.cu index b1efef36..a672e749 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x128x32_32x64x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_hswish.cu index fb6b71da..27ad631e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_id.cu index c38538c3..ddc61a51 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_relu.cu index b91efded..c5aaec6f 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x32x32_32x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_hswish.cu index c877492e..a5d437a2 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_id.cu index 40e6cb5d..d16c179b 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_relu.cu index 19f337a1..e5968cf5 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_32x64x32_32x64x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_hswish.cu index 27f7d5b8..61faa3e5 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_id.cu index 1a8e17fd..b5177e01 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_relu.cu index 1a4e679f..6e89d753 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x128x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_hswish.cu index 252a8f77..79d9c0f8 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_id.cu index f5d01d75..cc5c0026 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_relu.cu index a29be07e..3a71ccf0 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x32x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_hswish.cu index 07333503..de8806d6 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_id.cu index 41f6db05..b5e3a342 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_relu.cu index 34896baf..2243305e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_64x64x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_hswish.cu index 5f247f0c..6110868b 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu index 81cdd18c..0e5c018e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_relu.cu index 0bce8bf5..33d950ec 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_hswish.cu index 1ffdb3c4..012b70b0 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_id.cu index bf0fdcff..aff240c4 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_relu.cu index bb3a17ba..c101cd36 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x32x32_32x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_hswish.cu index 9df0b4f4..487a06fb 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_id.cu index c0da1d27..450fd723 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_relu.cu index 309aa239..efc249c2 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_32x64x32_32x64x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_hswish.cu index 210c52e0..8503592d 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu index 4f4f35f8..2d9f42b0 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_relu.cu index 57e4e210..d1cbc3ce 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_hswish.cu index 224e8a63..5c473b18 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_id.cu index 55761010..31a15cc7 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_relu.cu index 170f1971..fc86fa43 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x32x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_hswish.cu index 03cbfbc5..2f66a3b3 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_id.cu index 4698d99f..7da1dd3f 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_relu.cu index 38358f7e..199c133d 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_64x64x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_hswish.cu index b2c9c462..b17a6420 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_id.cu index e758d0da..af4824b2 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_relu.cu index f707002d..5d99e6fa 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x128x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_hswish.cu index d1c44bec..c854ef86 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_id.cu index 5158b527..1eb4a3aa 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_relu.cu index 21c01d36..c56b0b48 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x32x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_hswish.cu index b28f6824..d07b72e9 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_id.cu index f106af21..c9e2b449 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_relu.cu index 7f45ef62..18c8cea8 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_128x64x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_hswish.cu index 130b04b8..8f074563 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_id.cu index 42802443..b7f76e68 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_relu.cu index 0a7d3c99..ae4ce865 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x128x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_hswish.cu index 716a540b..670b6403 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_id.cu index 35b97bcd..5a85818a 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_relu.cu index 8ea93a1c..18b743fc 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x32x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_hswish.cu index 36819598..0d5fdc72 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_id.cu index ebeb8291..b0f4bd6f 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_relu.cu index d9b73fa2..f2ee1cdc 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_128x64x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_hswish.cu index 0cd9b194..157cd07b 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_id.cu index 4a89381a..eb93fa6c 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_relu.cu index 0011b40b..3d478931 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x128x32_32x64x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_hswish.cu index a60e3b18..99f335be 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_id.cu index 4bee34e8..643f43ca 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_relu.cu index 69ab4e10..18ba41d9 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x32x32_32x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_hswish.cu index eb4d164e..b84d3f0b 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_id.cu index 576e2378..83482b72 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_relu.cu index 32917d00..f66b7b28 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_32x64x32_32x64x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_hswish.cu index 367f6012..6ba333db 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_id.cu index e5ba977b..9fd0f3e9 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_relu.cu index c832049c..936502d6 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x128x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_hswish.cu index 8a81f587..f3f4977a 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_id.cu index e391dd64..dfe25c57 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_relu.cu index 27309b70..1d341e09 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x32x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_hswish.cu index 026080c8..ca51165e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_id.cu index 4deac474..eb0fcc2c 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_relu.cu index b0a42c14..ebd3b131 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_1x1_64x64x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_hswish.cu index 48036be9..c10268e6 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_id.cu index 6d1a69d2..258c39ba 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_relu.cu index 91ce847d..475b3585 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x128x32_32x64x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_hswish.cu index 6d4a698f..e178f8b2 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_id.cu index aeee09ac..33ab3378 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_relu.cu index d601e01d..9149da9e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x32x32_32x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_hswish.cu index fd235e0a..447d3c9e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_id.cu index ac180bd4..aad75d8a 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_relu.cu index 352535b7..c70fcadf 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_32x64x32_32x64x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_hswish.cu index ccaca6e8..88d1ec1b 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_id.cu index 35915b26..b414833f 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_relu.cu index 1c4b308e..aa3ee9cb 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x128x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_hswish.cu index c97b5e91..b8a5f973 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_id.cu index 8327475a..45d4a9c1 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_relu.cu index 1b0ba002..736fa4a1 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x32x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_hswish.cu index cabcf1ae..afb78f94 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_id.cu index 2f111fd9..20cd8530 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_relu.cu index b49faba5..c4d2fc3c 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32_64x64x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_hswish.cu index 35736675..889ded1a 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_id.cu index c447d319..c7e9abba 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_relu.cu index 898466e1..e0e825fa 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x128x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_hswish.cu index 83dd4204..52c3111b 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_id.cu index 5dc680a7..5ba1e043 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_relu.cu index bbf92f63..2c66f76a 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x32x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_hswish.cu index faa8ea18..bec858a1 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_id.cu index 7e4273a7..cd029a0c 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_relu.cu index 7fa35c72..38dc02c2 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_128x64x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_hswish.cu index 8865080e..7c3d5532 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 1, 4, 8, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_id.cu index 6a93ab61..af59cd30 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 1, 4, 8, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_relu.cu index 6e01aaaf..0b447b2b 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 1, 4, 8, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_hswish.cu index ec78da97..d8832567 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 4, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_id.cu index 27df4ce8..9c5f1ec0 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 4, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_relu.cu index b08e9563..86d700c6 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x64x8_16x64x8_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 4, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_hswish.cu index 88d5dad6..63e1b138 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_id.cu index 021eee46..5a83822c 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_relu.cu index 540f13ec..ad0340f1 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x128x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_hswish.cu index d9fa27c5..3bed635e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_id.cu index bd9b9db7..69d3d131 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_relu.cu index 3581ac6b..316a3072 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x32x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_hswish.cu index 2a13f48c..1070d213 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_id.cu index 33a1b07f..7b591fa4 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_relu.cu index 2e86a099..4c7bcb84 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_128x64x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_hswish.cu index ddea6211..080c3222 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 1, 4, 8, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_id.cu index 0aafd7fe..8b82f9a8 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 1, 4, 8, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_relu.cu index 0809ddee..cdcd6b85 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 1, 4, 8, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_hswish.cu index 56c22fc6..5c8ee549 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 4, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_id.cu index 68a8a13b..10bb5915 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 4, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_relu.cu index 8ca682f4..6a46eeac 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x64x8_16x64x8_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 4, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_hswish.cu index b6923f4a..90534a18 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_id.cu index 75acf999..9eb74ad1 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_relu.cu index 7248da5a..d470b22e 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x128x32_32x64x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_hswish.cu index b8c1e278..6b394e11 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_id.cu index 393b5a6c..652cea88 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_relu.cu index c1ae6410..5fc01557 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x32x32_32x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_hswish.cu index 2d327f97..9d743b45 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_id.cu index 33332c77..0921c3e9 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_relu.cu index 37369d3e..a9c8dd40 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_32x64x32_32x64x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_hswish.cu index 5ffd599e..23443ee2 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_id.cu index 2fea2ac0..ac202d09 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_relu.cu index 883b755d..ce911fc9 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x128x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_hswish.cu index 3cfe915f..1d2b47d2 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_id.cu index 6b711dc1..d38622ac 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_relu.cu index c6340646..3845f7fa 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x32x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_hswish.cu index e1a4fac1..2948cbbc 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_id.cu index 8b2c0d5f..0f75f986 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_relu.cu index eeb3c5c2..4aa41a48 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_64x64x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, false, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_hswish.cu index b04889b8..3a21872a 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_id.cu index 03d817af..86e0bfb5 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_relu.cu index 86009742..265d639c 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x128x32_32x64x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_hswish.cu index 22a9bfb1..da540ae8 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_id.cu index e1e20011..74c608a8 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_relu.cu index d9b963e0..c8e8d915 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x32x32_32x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_hswish.cu index 4ad86a3f..0dca550d 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_id.cu index 6f7648c3..989d5592 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_relu.cu index f3f8802b..684e5777 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_32x64x32_32x64x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_hswish.cu index 497591db..0d47d1cd 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_id.cu index b06de5af..d8927fbb 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_relu.cu index 28aa9131..5115a05b 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x128x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_hswish.cu index 87e5aae0..d448a3e6 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_id.cu index 0f5d5d87..8e71d8ca 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_relu.cu index daadee35..d09c6171 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x32x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_hswish.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_hswish.cu index 6f19370e..7aad8acb 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_id.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_id.cu index ca6f680b..3774df1a 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_id.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_relu.cu b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_relu.cu index b7eccd8c..e23ea6f5 100644 --- a/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_relu.cu +++ b/dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_64x64x32_64x32x32_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu< float, 1, int32_t, float, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, float, LayoutDst, float, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 4, 16, true, cutlass::arch::OpMultiplyAdd>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_hswish.cu index 42029b8a..b5817415 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_id.cu index 9de29a4a..7a0612f8 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_relu.cu index a91bd76d..cedd1f1b 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x128x64_64x64x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_hswish.cu index a156009f..d68997d0 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_id.cu index 645fb0f6..d707e5f7 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_relu.cu index b6451a72..0eb96352 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x256x64_64x64x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_hswish.cu index 0d5dc8c4..3dcf0497 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_id.cu index dd1e9c2e..e9dc07cc 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_relu.cu index 96f54e5d..c5653626 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_128x64x64_64x32x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_hswish.cu index cdef6ce7..f51466f5 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_id.cu index b576c796..7ae41fd2 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_relu.cu index 7d6d5092..7e28c5bd 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x128x64_64x64x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_hswish.cu index b504399a..bc75f547 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_id.cu index e1f9eaa2..caeaba01 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_relu.cu index 03f90194..af6c2a3c 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x256x64_64x64x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_hswish.cu index 006f790a..32294cd5 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_id.cu index 0906068e..34f00363 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_relu.cu index adcbfbea..eea42928 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_128x64x64_64x32x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_hswish.cu index 31bac99f..7f14fddc 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_id.cu index d35a2be8..ddc4869d 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_relu.cu index 272c7249..b7d0aba9 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_256x128x64_64x64x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_hswish.cu index 7315cecb..dee9e38c 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 16, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_id.cu index 9d4b8466..33939b7b 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 16, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_relu.cu index 6a87dc00..8926a9d6 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_32x64x64_32x16x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 16, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_hswish.cu index f234109f..2e9eda95 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_id.cu index 8846c82f..7e9fc7a4 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_relu.cu index 54a30792..00ed690b 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x128x64_32x64x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_hswish.cu index df3571a2..e644b166 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_id.cu index deba71bb..f870aed5 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_relu.cu index 2392dc7b..d6bebaf2 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_1x1_64x64x64_32x32x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_hswish.cu index fe0b5849..33027d35 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_id.cu index 61ac1b49..adca5a47 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_relu.cu index 4ea0b892..9ca3b541 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_256x128x64_64x64x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_hswish.cu index ffef0da2..89b80552 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 16, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_id.cu index aa544e2d..30e53fc0 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 16, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_relu.cu index 762bf3e9..465fe265 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_32x64x64_32x16x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 16, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_hswish.cu index bad9a5ce..00fb0d97 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_id.cu index b08d7a95..1b2402de 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_relu.cu index 9173c904..647694ce 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x128x64_32x64x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_hswish.cu index ceb38356..54cca4c8 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_id.cu index 7f1b55c0..48c0ed8f 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_relu.cu index c599f7ac..bbfd2311 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_64x64x64_32x32x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 8, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_hswish.cu index e11f92ff..b9333b45 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_id.cu index bf638233..0ed1e56a 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_relu.cu index 5ef27dfd..9ffd151c 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x128x64_64x64x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_hswish.cu index f46311f2..5371a42a 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_id.cu index f9b97cd4..c152bd5e 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_relu.cu index 3ac0bd36..dfe97c7f 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x256x64_64x64x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_hswish.cu index c426dddf..9db800d3 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_id.cu index 9fe447f3..4f0e90ad 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_relu.cu index 198293e2..c664ccd8 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_128x64x64_64x32x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_hswish.cu index 1d4a974f..ea9cedcb 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_id.cu index 069e96db..9652ea56 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_relu.cu index 80a947c2..1beff87f 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x128x64_64x64x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_hswish.cu index 5fe0f19c..876e0dae 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_id.cu index ca80dda2..e86c5238 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_relu.cu index 812cdc81..36b3b3c8 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x256x64_64x64x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_hswish.cu index 0b0da908..4e115ae8 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_id.cu index 89608b5b..a7829ddc 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_relu.cu index 80aac1fc..e9cd6b18 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_128x64x64_64x32x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_hswish.cu index 338226f8..b98a9376 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_id.cu index bde7fed6..f8ca8877 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_relu.cu index 5cc092b6..88420f6d 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_256x128x64_64x64x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_hswish.cu index b49cdc6b..34234df5 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_id.cu index 6957270f..c69170cf 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_relu.cu index 28960bc4..ed347534 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_32x64x64_16x32x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_hswish.cu index 99ff27cd..8a9c6cdc 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_id.cu index c913c159..1d081f4c 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_relu.cu index 5144b3bc..997cda70 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x128x64_32x64x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_hswish.cu index ef7e3b9d..b27c8248 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_id.cu index adb02359..07d0fcd2 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_relu.cu index b3df1baf..94167d78 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_1x1_64x64x64_32x32x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, false, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_hswish.cu index a0a2a8be..c27f507c 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_id.cu index 89645723..44c2437d 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_relu.cu index 513ff269..bd2802ee 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_256x128x64_64x64x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_hswish.cu index c9e96d49..ce9e3b03 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_id.cu index 20b9cddc..ff5f6d01 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_relu.cu index 41cd9ae5..a0786d6c 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_32x64x64_16x32x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_hswish.cu index 3151e3dc..964f4219 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_id.cu index c20fe3d0..c6dc75cc 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_relu.cu index dddbed50..e8c765ca 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x128x64_32x64x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_hswish.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_hswish.cu index 6028fba1..e0cbba9b 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_hswish.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_hswish.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_id.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_id.cu index 0ea06e92..0ff0f65f 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_id.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_id.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_relu.cu b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_relu.cu index 688064da..822b0479 100644 --- a/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_relu.cu +++ b/dnn/src/cuda/conv_bias/int8_imma/kimpl/conv_bias_int8_implicit_gemm_imma_ncdiv32hw32_ncdiv4hw4_64x64x64_32x32x64_relu.cu @@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 64>; using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< int8_t, 4, int32_t, int32_t, float>; -using Convolution = cutlass::convolution::device::Convolution< +using Convolution = cutlass::conv::device::Convolution< int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, LayoutDst, int32_t, LayoutDst, int32_t, - cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, + cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, - cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< - cutlass::convolution::ConvType::kConvolution>, + cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, 2, 16, 16, true, cutlass::arch::OpMultiplyAddSaturate>; template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( diff --git a/dnn/src/cuda/convolution/backward_data/algo.cpp b/dnn/src/cuda/convolution/backward_data/algo.cpp index dcdcd589..c643db1d 100644 --- a/dnn/src/cuda/convolution/backward_data/algo.cpp +++ b/dnn/src/cuda/convolution/backward_data/algo.cpp @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "./algo.h" @@ -20,32 +21,38 @@ ConvolutionBackwardDataImpl::AlgoPack::AlgoPack() { non_cudnn_algos.push_back(&chanwise_small); non_cudnn_algos.push_back(&matmul); - all_algos.push_back(&chanwise); // prefer chanwise - all_algos.push_back(&chanwise_small); // prefer small chanwise + all_algos.push_back(&chanwise); // prefer chanwise + all_algos.push_back(&chanwise_small); // prefer small chanwise fill_cudnn_algos(); - for (auto &&i: cudnn) { + for (auto&& i : cudnn) { all_algos.push_back(&i); } all_algos.push_back(&matmul); + fill_int8_dp4a_algos(); + for (auto&& algo : int8_nchw4_dotprod) { + all_algos.push_back(&algo); + int8_algos.push_back(&algo); + } + all_algos.reserve(all_algos.size() * 2); // add gconv algos by AlgoGroupConvGeneral auto all_algos_data = all_algos.data(); size_t group_algo_start = 2; - for (size_t i = group_algo_start; i < all_algos.size(); ++ i) { + for (size_t i = group_algo_start; i < all_algos.size(); ++i) { gconv.push_back({all_algos[i]}); } - for (size_t i = group_algo_start; i < all_algos.size(); ++ i) { + for (size_t i = group_algo_start; i < all_algos.size(); ++i) { algo2gconv[all_algos[i]] = &gconv[i - group_algo_start]; } - for (auto &&i: gconv) { + for (auto&& i : gconv) { all_algos.push_back(&i); } megdnn_assert(all_algos_data == all_algos.data()); - non_cudnn_algos.push_back(all_algos.rbegin()[0]); // group matmul + non_cudnn_algos.push_back(all_algos.rbegin()[0]); // group matmul all_algos.push_back(&bfloat16); bfloat16_algos.push_back(&bfloat16); @@ -59,63 +66,55 @@ MEGDNN_DEF_GET_ALGO_FROM_DESC(ConvolutionBackwardDataImpl) ConvolutionBackwardDataImpl::AlgoCUDNN* ConvolutionBackwardDataImpl::AlgoPack::cudnn_from_enum( cudnnConvolutionBwdDataAlgo_t algo) { - for (auto &&i: cudnn) { + for (auto&& i : cudnn) { if (i.cudnn_enum() == algo) return &i; } - megdnn_throw(megdnn_mangle(ssprintf( - "can not find cudnn bwd_data algorithm %d", - static_cast(algo)))); + megdnn_throw( + megdnn_mangle(ssprintf("can not find cudnn bwd_data algorithm %d", + static_cast(algo)))); } ConvolutionBackwardDataImpl::AlgoPack ConvolutionBackwardDataImpl::sm_algo_pack; ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::SizeArgs( - ConvolutionBackwardDataImpl *o, - const TensorLayout &filter, const TensorLayout &diff, - const TensorLayout &grad): - SizeArgs(o, filter, o->check_layout_fwd(grad, filter, diff), diff, grad) -{ -} + ConvolutionBackwardDataImpl* o, const TensorLayout& filter, + const TensorLayout& diff, const TensorLayout& grad) + : SizeArgs(o, filter, o->check_layout_fwd(grad, filter, diff), diff, + grad) {} ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::SizeArgs( - ConvolutionBackwardDataImpl *o, const TensorLayout& filter, - const CanonizedFilterMeta &filter_meta, const TensorLayout &diff, - const TensorLayout &grad): - handle{concrete_handle(o->handle())}, - filter_meta{filter_meta}, - diff_layout{&diff}, - grad_layout{&grad}, - filter_layout{&filter}, - opr{o} -{ -} + ConvolutionBackwardDataImpl* o, const TensorLayout& filter, + const CanonizedFilterMeta& filter_meta, const TensorLayout& diff, + const TensorLayout& grad) + : handle{concrete_handle(o->handle())}, + filter_meta{filter_meta}, + diff_layout{&diff}, + grad_layout{&grad}, + filter_layout{&filter}, + opr{o} {} ConvolutionBackwardDataImpl::AlgoBase::ExecArgs::ExecArgs( - ConvolutionBackwardDataImpl *opr, - _megdnn_tensor_in filter, - _megdnn_tensor_in diff, - _megdnn_tensor_out grad, - _megdnn_workspace workspace): - SizeArgs(opr, filter.layout, diff.layout, grad.layout), - filter_tensor{&filter}, diff_tensor{&diff}, grad_tensor{&grad}, - workspace{workspace} -{ -} + ConvolutionBackwardDataImpl* opr, _megdnn_tensor_in filter, + _megdnn_tensor_in diff, _megdnn_tensor_out grad, + _megdnn_workspace workspace) + : SizeArgs(opr, filter.layout, diff.layout, grad.layout), + filter_tensor{&filter}, + diff_tensor{&diff}, + grad_tensor{&grad}, + workspace{workspace} {} std::string ConvolutionBackwardDataImpl::AlgoBase::SizeArgs::to_string() const { - auto &&fm = filter_meta; + auto&& fm = filter_meta; MEGDNN_MARK_USED_VAR(fm); return megdnn_mangle(ssprintf( - "filter=%u{%u,%u,%u,%u}, diff=%s, grad=%s, " - "pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, dtype=%s,%s", - fm.group, fm.ocpg, fm.icpg, fm.spatial[0], fm.spatial[1], - diff_layout->to_string().c_str(), - grad_layout->to_string().c_str(), - fm.padding[0], fm.padding[1], fm.stride[0], fm.stride[1], - fm.dilation[0], fm.dilation[1], - !fm.should_flip, - diff_layout->dtype.name(), grad_layout->dtype.name())); + "filter=%u{%u,%u,%u,%u}, diff=%s, grad=%s, " + "pad=%ux%u, stride=%ux%u, dilate=%ux%u, xcorr=%d, dtype=%s,%s", + fm.group, fm.ocpg, fm.icpg, fm.spatial[0], fm.spatial[1], + diff_layout->to_string().c_str(), grad_layout->to_string().c_str(), + fm.padding[0], fm.padding[1], fm.stride[0], fm.stride[1], + fm.dilation[0], fm.dilation[1], !fm.should_flip, + diff_layout->dtype.name(), grad_layout->dtype.name())); } // vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/backward_data/algo.h b/dnn/src/cuda/convolution/backward_data/algo.h index e3cd9f94..0f2a1e47 100644 --- a/dnn/src/cuda/convolution/backward_data/algo.h +++ b/dnn/src/cuda/convolution/backward_data/algo.h @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #pragma once @@ -38,6 +39,7 @@ public: CUDA_CHANWISE_SMALL, CUDA_BFLOAT16, CUDA_GROUP_CONV_GENERAL, + CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8 }; using Mapper = std::unordered_map; @@ -240,9 +242,53 @@ public: } }; +class ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm final + : public AlgoBase { +public: + struct AlgoParam { + int threadblock_m; + int threadblock_n; + int threadblock_k; + int warp_m; + int warp_n; + int warp_k; + int stage; + std::string to_string() { + /// default algorithm + if (threadblock_m == 128 && threadblock_n == 128 && + threadblock_k == 32 && warp_m == 32 && warp_n == 64 && + warp_k == 32 && stage == 2) { + return ""; + } + return ssprintf("_%dX%dX%d_%dX%dX%d_%dstage", threadblock_m, + threadblock_n, threadblock_k, warp_m, warp_n, + warp_k, stage); + } + }; + AlgoInt8NCHW4DotProdImplicitGemm(AlgoParam algo_param) + : m_algo_param{algo_param}, + m_name{ssprintf("INT8_NCHW4_DOTPROD_IMPLICIT_GEMM%s", + m_algo_param.to_string().c_str())} {} + bool is_available(const SizeArgs& args) const override; + size_t get_workspace_in_bytes(const SizeArgs& args) const override; + void exec(const ExecArgs& args) const override; + const char* name() const override { return m_name.c_str(); } + AlgoAttribute attribute() const override { + return AlgoAttribute::REPRODUCIBLE; + } + MEGDNN_DECL_ALGO_TYPE(CUDA_IMPLICIT_GEMM_NCHW4_DOTPROD_INT8) +private: + WorkspaceBundle get_workspace_bundle(dt_byte* raw_ptr, + const SizeArgs& args) const; + AlgoParam m_algo_param; + std::string m_name; +}; + class ConvolutionBackwardDataImpl::AlgoPack : NonCopyableObj { // defined in cudnn.cpp void fill_cudnn_algos(); + // defined in implicit_gemm_int8_nchw4_dp4a.cpp + void fill_int8_dp4a_algos(); AlgoBase::Mapper m_all_algos_map; @@ -256,12 +302,13 @@ public: std::vector gconv; std::unordered_map algo2gconv; AlgoBFloat16 bfloat16; + std::vector int8_nchw4_dotprod; std::vector //! all algorithms all_algos, //! non-cudnn algos, used for heuristic if cudnn is not supported - non_cudnn_algos, bfloat16_algos; + non_cudnn_algos, bfloat16_algos, int8_algos; AlgoCUDNN* cudnn_from_enum(cudnnConvolutionBwdDataAlgo_t algo); diff --git a/dnn/src/cuda/convolution/backward_data/chanwise.cpp b/dnn/src/cuda/convolution/backward_data/chanwise.cpp index f6272fca..89562f4d 100644 --- a/dnn/src/cuda/convolution/backward_data/chanwise.cpp +++ b/dnn/src/cuda/convolution/backward_data/chanwise.cpp @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "./algo.h" @@ -19,8 +20,10 @@ using namespace convolution; bool ConvolutionBackwardDataImpl::AlgoChanwise::is_available( const SizeArgs& args) const { - if (args.diff_layout->dtype == args.filter_layout->dtype && - args.diff_layout->dtype == dtype::BFloat16()) { + if ((args.diff_layout->dtype == args.filter_layout->dtype && + args.diff_layout->dtype == dtype::BFloat16()) || + (args.diff_layout->dtype == args.filter_layout->dtype && + args.diff_layout->dtype == dtype::QuantizedS8())) { return false; } auto&& fm = args.filter_meta; @@ -74,4 +77,3 @@ void ConvolutionBackwardDataImpl::AlgoChanwise::exec( } // vim: syntax=cpp.doxygen - diff --git a/dnn/src/cuda/convolution/backward_data/chanwise_small.cpp b/dnn/src/cuda/convolution/backward_data/chanwise_small.cpp index cc5b12e2..bb8ba182 100644 --- a/dnn/src/cuda/convolution/backward_data/chanwise_small.cpp +++ b/dnn/src/cuda/convolution/backward_data/chanwise_small.cpp @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "src/cuda/convolution/backward_data/algo.h" @@ -28,9 +29,11 @@ inline bool is_available_small(const chanwise::Param& param) { } // anonymous namespace bool ConvolutionBackwardDataImpl::AlgoChanwiseSmall::is_available( - const SizeArgs &args) const { - if (args.diff_layout->dtype == args.filter_layout->dtype && - args.diff_layout->dtype == dtype::BFloat16()) { + const SizeArgs& args) const { + if ((args.diff_layout->dtype == args.filter_layout->dtype && + args.diff_layout->dtype == dtype::BFloat16()) || + (args.diff_layout->dtype == args.filter_layout->dtype && + args.diff_layout->dtype == dtype::QuantizedS8())) { return false; } #if CUDA_VERSION < 9000 @@ -38,30 +41,29 @@ bool ConvolutionBackwardDataImpl::AlgoChanwiseSmall::is_available( return false; #endif auto kparam = chanwise::Param::from_fwd_args(args.as_fwd_args()); - auto &&fm = args.filter_meta; + auto&& fm = args.filter_meta; return args.filter_meta.format == Param::Format::NCHW && - args.diff_layout->dtype.category() == DTypeCategory::FLOAT && + args.diff_layout->dtype.category() == DTypeCategory::FLOAT && args.opr->param().compute_mode == Param::ComputeMode::DEFAULT && - fm.spatial_ndim == 2 && fm.icpg == 1 && - fm.dilation[0] == 1 && fm.dilation[1] == 1 && - !fm.should_flip && is_available_small(kparam); + fm.spatial_ndim == 2 && fm.icpg == 1 && fm.dilation[0] == 1 && + fm.dilation[1] == 1 && !fm.should_flip && is_available_small(kparam); } size_t ConvolutionBackwardDataImpl::AlgoChanwiseSmall::get_workspace_in_bytes( - const SizeArgs &) const { + const SizeArgs&) const { return 0; } void ConvolutionBackwardDataImpl::AlgoChanwiseSmall::exec( - const ExecArgs &args) const { + const ExecArgs& args) const { auto kparam = chanwise::Param::from_fwd_args(args.as_fwd_args()); auto stream = cuda_stream(args.handle); switch (args.grad_layout->dtype.enumv()) { case DTypeEnum::Float32: - return chanwise::run_bwd_data_small(args.grad_tensor->ptr(), - args.diff_tensor->ptr(), - args.filter_tensor->ptr(), kparam, - stream); + return chanwise::run_bwd_data_small( + args.grad_tensor->ptr(), + args.diff_tensor->ptr(), + args.filter_tensor->ptr(), kparam, stream); #if CUDA_VERSION >= 9000 case DTypeEnum::Float16: return chanwise::run_bwd_data_small( @@ -77,4 +79,3 @@ void ConvolutionBackwardDataImpl::AlgoChanwiseSmall::exec( } // vim: syntax=cpp.doxygen - diff --git a/dnn/src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cu b/dnn/src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cu new file mode 100644 index 00000000..1d054ae4 --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cu @@ -0,0 +1,100 @@ +/** + * \file src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" + +#if !MEGDNN_TEGRA_X1 +#include "cutlass/convolution/device/convolution.h" +#endif +#include "src/common/opr_param_defs_enumv.cuh" +#include "src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cuh" +#pragma GCC diagnostic pop + +using namespace megdnn; +using namespace cuda; +using namespace cutlass_wrapper; + +/* ================ cutlass kernel wrapper for nchw4 layout ================= */ +#if MEGDNN_TEGRA_X1 +void megdnn::cuda::cutlass_wrapper::do_deconv_int8_implicit_gemm_dp4a_ncdiv4hw4( + const int8_t* /* d_src */, const int8_t* /* d_filter */, + int8_t* /* d_dst */, int* /* workspace */, + const convolution::ConvParam& /* param */, float /* alpha */, + const GemmCoord& /* threadblock_shape */, + const GemmCoord& /* warp_shape */, int /* stages */, + cudaStream_t /* stream */) {} +#else +void megdnn::cuda::cutlass_wrapper::do_deconv_int8_implicit_gemm_dp4a_ncdiv4hw4( + const int8_t* d_src, const int8_t* d_filter, int8_t* d_dst, + int* workspace, const convolution::ConvParam& param, float alpha, + const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, + int stages, cudaStream_t stream) { +#define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \ + threadblock_k_, warp_m_, warp_n_, \ + warp_k_, stage_, aligned_) \ + if (threadblock_shape.m() == threadblock_m_ && \ + threadblock_shape.n() == threadblock_n_ && \ + threadblock_shape.k() == threadblock_k_ && \ + warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \ + warp_shape.k() == warp_k_ && stages == stage_) { \ + using ThreadBlockShape = \ + cutlass::gemm::GemmShape; \ + using WarpShape = cutlass::gemm::GemmShape; \ + using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ + using Deconvolution = cutlass::conv::device::Deconvolution< \ + int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ + cutlass::layout::TensorKxRSCx<4>, ElementOutput, \ + cutlass::layout::TensorNCxHWx<4>, int32_t, \ + cutlass::layout::TensorNCxHWx<4>, int32_t, \ + cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ + cutlass::conv::threadblock:: \ + ConvolutionDgradNCxHWxThreadblockSwizzle, \ + stage_, 4, aligned_>; \ + typename Deconvolution::ConvolutionParameter conv_param( \ + param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ + param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ + param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ + return cutlass_deconvolution_wrapper( \ + d_src, d_filter, nullptr, nullptr, d_dst, workspace, \ + conv_param, epilogue, stream); \ + } +#define DISPATCH_KERNEL \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 64, 8, 16, 64, 8, 2, 4); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 128, 16, 16, 64, 16, 2, 4); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 128, 16, 16, 128, 16, 1, 8); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \ + DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \ + megdnn_assert(false, \ + "unsupported threadblock shape (%dx%dx%d) and warp shape " \ + "(%dx%dx%d)", \ + threadblock_shape.m(), threadblock_shape.n(), \ + threadblock_shape.k(), warp_shape.m(), warp_shape.n(), \ + warp_shape.k()); + using ElementOutput = int8_t; + using ElementAccumulator = int32_t; + using ElementBias = int32_t; + using ElementCompute = float; + using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + ElementOutput, 4, ElementAccumulator, ElementBias, ElementCompute>; + typename EpilogueOp::Params epilogue{alpha, 0, 0}; + DISPATCH_KERNEL; + +#undef DISPATCH_KERNEL_WITH_TILE_SHAPE +#undef DISPATCH_KERNEL +} +#endif + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cuh b/dnn/src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cuh new file mode 100644 index 00000000..35961673 --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cuh @@ -0,0 +1,44 @@ +/** + * \file src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#pragma once +#include "cutlass/gemm/gemm.h" +#include "src/cuda/convolution_helper/parameter.cuh" +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace cutlass_wrapper { + +using GemmCoord = cutlass::gemm::GemmCoord; + +template +void cutlass_deconvolution_wrapper( + const typename Convolution::ElementSrc* d_src, + const typename Convolution::ElementFilter* d_filter, + const typename Convolution::ElementBias* d_bias, + const typename Convolution::ElementDst* d_z, + typename Convolution::ElementDst* d_dst, int* workspace, + typename Convolution::ConvolutionParameter const& conv_param, + typename Convolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); + +void do_deconv_int8_implicit_gemm_dp4a_ncdiv4hw4( + const int8_t* d_src, const int8_t* d_filter, int8_t* d_dst, + int* workspace, const convolution::ConvParam& param, float alpha, + const GemmCoord& threadblock_shape, const GemmCoord& warp_shape, + int stages, cudaStream_t stream); + +} // namespace cutlass_wrapper +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/convolution/backward_data/deconv_int8_helper.cu b/dnn/src/cuda/convolution/backward_data/deconv_int8_helper.cu new file mode 100644 index 00000000..f3d284c3 --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/deconv_int8_helper.cu @@ -0,0 +1,76 @@ +/** + * \file src/cuda/convolution/backward_data/deconv_int8_helper.cu + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "src/cuda/convolution/backward_data/deconv_int8_helper.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace deconv; + +#define BLOCKSIZE_X 16 +#define BLOCKSIZE_Y 16 + +namespace { + +// +__global__ void reorder_filter_nc4hw4_to_n4hwc4_kernel( + int8_t* __restrict__ dst, const int8_t* __restrict__ src, uint32_t OC, + uint32_t IC, uint32_t FHFW) { + const int32_t ocb = blockIdx.z; + const int32_t icb = blockIdx.y * BLOCKSIZE_X + threadIdx.y; + const int32_t fhfw = blockIdx.x * BLOCKSIZE_Y + threadIdx.x; + + if (fhfw < FHFW && icb < IC / 4) { + int src0 = *reinterpret_cast( + src + (ocb * 4 + 0) * IC * FHFW + (icb * FHFW + fhfw) * 4); + int src1 = *reinterpret_cast( + src + (ocb * 4 + 1) * IC * FHFW + (icb * FHFW + fhfw) * 4); + int src2 = *reinterpret_cast( + src + (ocb * 4 + 2) * IC * FHFW + (icb * FHFW + fhfw) * 4); + int src3 = *reinterpret_cast( + src + (ocb * 4 + 3) * IC * FHFW + (icb * FHFW + fhfw) * 4); + // transpose 4x4 + int dst01_lo = __byte_perm(src0, src1, 0x5140); + int dst01_hi = __byte_perm(src0, src1, 0x7362); + int dst23_lo = __byte_perm(src2, src3, 0x5140); + int dst23_hi = __byte_perm(src2, src3, 0x7362); + int dst0 = __byte_perm(dst01_lo, dst23_lo, 0x5410); + int dst1 = __byte_perm(dst01_lo, dst23_lo, 0x7632); + int dst2 = __byte_perm(dst01_hi, dst23_hi, 0x5410); + int dst3 = __byte_perm(dst01_hi, dst23_hi, 0x7632); + + *reinterpret_cast( + dst + (ocb * FHFW * IC + fhfw * IC + icb * 4 + 0) * 4) = dst0; + *reinterpret_cast( + dst + (ocb * FHFW * IC + fhfw * IC + icb * 4 + 1) * 4) = dst1; + *reinterpret_cast( + dst + (ocb * FHFW * IC + fhfw * IC + icb * 4 + 2) * 4) = dst2; + *reinterpret_cast( + dst + (ocb * FHFW * IC + fhfw * IC + icb * 4 + 3) * 4) = dst3; + } +} + +} // namespace + +void megdnn::cuda::deconv::reorder_filter_nc4hw4_to_n4hwc4( + int8_t* dst, const int8_t* src, uint32_t OC, uint32_t IC, uint32_t FH, + uint32_t FW, cudaStream_t stream) { + dim3 threads(BLOCKSIZE_X, BLOCKSIZE_Y, 1); + dim3 blocks(DIVUP(FH * FW, BLOCKSIZE_X), DIVUP(IC / 4, BLOCKSIZE_Y), + OC / 4); + + reorder_filter_nc4hw4_to_n4hwc4_kernel<<>>( + dst, src, OC, IC, FH * FW); + after_kernel_launch(); +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/convolution/backward_data/deconv_int8_helper.cuh b/dnn/src/cuda/convolution/backward_data/deconv_int8_helper.cuh new file mode 100644 index 00000000..f50b3c36 --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/deconv_int8_helper.cuh @@ -0,0 +1,27 @@ +/** + * \file src/cuda/convolution/backward_data/deconv_int8_helper.cuh + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#pragma once +#include "src/cuda/utils.cuh" + +namespace megdnn { +namespace cuda { +namespace deconv { + +void reorder_filter_nc4hw4_to_n4hwc4(int8_t* dst, const int8_t* src, + uint32_t OC, uint32_t IC, uint32_t FH, + uint32_t FW, cudaStream_t stream); + +} // namespace deconv +} // namespace cuda +} // namespace megdnn + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/convolution/backward_data/group_conv.cpp b/dnn/src/cuda/convolution/backward_data/group_conv.cpp index e4df3ac8..c2e3b9f4 100644 --- a/dnn/src/cuda/convolution/backward_data/group_conv.cpp +++ b/dnn/src/cuda/convolution/backward_data/group_conv.cpp @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "./algo.h" @@ -16,8 +17,8 @@ using namespace cuda; using namespace convolution; void ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::modify_size_args( - ConvolutionBackwardDataImpl::AlgoBase::SizeArgs &args, - TensorLayout &diff_pg, TensorLayout &grad_pg) { + ConvolutionBackwardDataImpl::AlgoBase::SizeArgs& args, + TensorLayout& diff_pg, TensorLayout& grad_pg) { diff_pg = *args.diff_layout; grad_pg = *args.grad_layout; auto nr_grp = args.filter_meta.group; @@ -29,17 +30,18 @@ void ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::modify_size_args( } ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::AlgoGroupConvGeneral( - AlgoBase *impl): - m_impl{impl} -{ + AlgoBase* impl) + : m_impl{impl} { m_name = "group_conv:"; m_name += impl->name(); } bool ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::is_available( - const SizeArgs &args) const { - if (args.diff_layout->dtype == args.filter_layout->dtype && - args.diff_layout->dtype == dtype::BFloat16()) { + const SizeArgs& args) const { + if ((args.diff_layout->dtype == args.filter_layout->dtype && + args.diff_layout->dtype == dtype::BFloat16()) || + (args.diff_layout->dtype == args.filter_layout->dtype && + args.diff_layout->dtype == dtype::QuantizedS8())) { return false; } auto sub_args = args; @@ -48,8 +50,9 @@ bool ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::is_available( return m_impl->is_available(sub_args); } -size_t ConvolutionBackwardDataImpl::AlgoGroupConvGeneral:: -get_workspace_in_bytes(const SizeArgs &args) const { +size_t +ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::get_workspace_in_bytes( + const SizeArgs& args) const { auto sub_args = args; TensorLayout diff_pg, grad_pg; modify_size_args(sub_args, diff_pg, grad_pg); @@ -57,24 +60,24 @@ get_workspace_in_bytes(const SizeArgs &args) const { } void ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::exec( - const ExecArgs &args) const { + const ExecArgs& args) const { auto sub_args = args; TensorND tflt{*args.filter_tensor}, tdiff{*args.diff_tensor}, - tgrad{*args.grad_tensor}; + tgrad{*args.grad_tensor}; modify_size_args(sub_args, tdiff.layout, tgrad.layout); sub_args.filter_tensor = &tflt; sub_args.diff_tensor = &tdiff; sub_args.grad_tensor = &tgrad; auto grp = args.filter_meta.group; - auto &&fm = args.filter_meta; - auto strd_flt = (fm.icpg * fm.ocpg * - fm.spatial[0] * fm.spatial[1] * tflt.layout.dtype.size()), - strd_diff = ( - tdiff.layout.stride[1] * fm.ocpg * tdiff.layout.dtype.size()), - strd_grad = ( - tgrad.layout.stride[1] * fm.icpg * tgrad.layout.dtype.size()); - for (uint32_t g = 0; g < grp; ++ g) { + auto&& fm = args.filter_meta; + auto strd_flt = (fm.icpg * fm.ocpg * fm.spatial[0] * fm.spatial[1] * + tflt.layout.dtype.size()), + strd_diff = + (tdiff.layout.stride[1] * fm.ocpg * tdiff.layout.dtype.size()), + strd_grad = + (tgrad.layout.stride[1] * fm.icpg * tgrad.layout.dtype.size()); + for (uint32_t g = 0; g < grp; ++g) { m_impl->exec(sub_args); incr_voidp(tflt.raw_ptr, strd_flt); incr_voidp(tdiff.raw_ptr, strd_diff); @@ -83,4 +86,3 @@ void ConvolutionBackwardDataImpl::AlgoGroupConvGeneral::exec( } // vim: syntax=cpp.doxygen - diff --git a/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp new file mode 100644 index 00000000..9cb8e647 --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/implicit_gemm_int8_nchw4_dp4a.cpp @@ -0,0 +1,127 @@ +/** + * \file dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#include "./algo.h" +#include "src/cuda/utils.h" +#include "src/cuda/convolution_helper/parameter.cuh" +#include "src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cuh" +#include "src/cuda/convolution/backward_data/deconv_int8_helper.cuh" + +using namespace megdnn; +using namespace cuda; + +bool ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm:: + is_available(const SizeArgs& args) const { + auto&& fm = args.filter_meta; + if (fm.format != Param::Format::NCHW4) + return false; + + bool available = true; + + auto src_dtype = args.diff_layout->dtype, + filter_dtype = args.filter_layout->dtype, + dst_dtype = args.grad_layout->dtype; + + available &= (src_dtype.enumv() == DTypeEnum::QuantizedS8 && + filter_dtype.enumv() == DTypeEnum::QuantizedS8 && + dst_dtype.enumv() == DTypeEnum::QuantizedS8); + // TODO support group deconv int8 + available &= (fm.group == 1); + // mode must be cross correlation + available &= !fm.should_flip; + // mode must be 2D + available &= fm.spatial_ndim == 2; + // TODO: support dialtion + available &= (fm.dilation[0] == 1 && fm.dilation[1] == 1); + // FIXME: too large filter size is not supported now + available &= fm.spatial[0] * fm.spatial[1] <= 64; + // only support sm_61 or later, platform should have fast native int8 + // support + available &= is_compute_capability_required(6, 1); + + return available; +} + +WorkspaceBundle ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm:: + get_workspace_bundle(dt_byte* raw_ptr, const SizeArgs& args) const { + size_t ws_filter = args.filter_layout->span().dist_byte(); + return WorkspaceBundle{raw_ptr, {ws_filter}}; +} + +size_t ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm:: + get_workspace_in_bytes(const SizeArgs& args) const { + return get_workspace_bundle(nullptr, args).total_size_in_bytes(); +} + +void ConvolutionBackwardDataImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec( + const ExecArgs& args) const { + auto&& fm = args.filter_meta; + size_t n = args.diff_layout->operator[](0), + co = args.diff_layout->operator[](1) * 4, + ho = args.diff_layout->operator[](2), + wo = args.diff_layout->operator[](3); + size_t ci = args.grad_layout->operator[](1) * 4, + hi = args.grad_layout->operator[](2), + wi = args.grad_layout->operator[](3); + size_t fh = fm.spatial[0], fw = fm.spatial[1]; + size_t sh = fm.stride[0], sw = fm.stride[1]; + size_t ph = fm.padding[0], pw = fm.padding[1]; + + auto&& stream = cuda_stream(args.opr->handle()); + + int8_t* filter_ptr = nullptr; + // TODO: weight preprocess + { + filter_ptr = reinterpret_cast(args.workspace.raw_ptr); + // reformat filter from nc4hw4 to n4hwc4 + megdnn::cuda::deconv::reorder_filter_nc4hw4_to_n4hwc4( + filter_ptr, args.filter_tensor->compatible_ptr(), co, + ci, fh, fw, stream); + } + convolution::ConvParam kern_param; + kern_param.n = n, kern_param.co = co, kern_param.ci = ci, + kern_param.hi = hi, kern_param.wi = wi, kern_param.ho = ho, + kern_param.wo = wo, kern_param.ph = ph, kern_param.pw = pw, + kern_param.sh = sh, kern_param.sw = sw, kern_param.fh = fh, + kern_param.fw = fw; + + float diff_scale = + args.diff_layout->dtype.param().scale, + filter_scale = + args.filter_layout->dtype.param().scale, + grad_scale = + args.grad_layout->dtype.param().scale; + float alpha = diff_scale * filter_scale / grad_scale; + cutlass_wrapper::do_deconv_int8_implicit_gemm_dp4a_ncdiv4hw4( + args.diff_tensor->compatible_ptr(), filter_ptr, + args.grad_tensor->compatible_ptr(), nullptr, kern_param, + alpha, + cutlass_wrapper::GemmCoord{m_algo_param.threadblock_m, + m_algo_param.threadblock_n, + m_algo_param.threadblock_k}, + cutlass_wrapper::GemmCoord{m_algo_param.warp_m, m_algo_param.warp_n, + m_algo_param.warp_k}, + m_algo_param.stage, stream); + + after_kernel_launch(); +} + +void ConvolutionBackwardDataImpl::AlgoPack::fill_int8_dp4a_algos() { + using AlgoParam = AlgoInt8NCHW4DotProdImplicitGemm::AlgoParam; + int8_nchw4_dotprod.emplace_back(AlgoParam{16, 64, 8, 16, 64, 8, 2}); + int8_nchw4_dotprod.emplace_back(AlgoParam{16, 128, 16, 16, 64, 16, 2}); + int8_nchw4_dotprod.emplace_back(AlgoParam{16, 128, 16, 16, 128, 16, 1}); + int8_nchw4_dotprod.emplace_back(AlgoParam{32, 128, 32, 32, 64, 32, 2}); + int8_nchw4_dotprod.emplace_back(AlgoParam{64, 128, 32, 64, 32, 32, 2}); +} + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/convolution/backward_data/int8/deconv_int8_implicit_gemm_cutlass_wrapper.cuinl b/dnn/src/cuda/convolution/backward_data/int8/deconv_int8_implicit_gemm_cutlass_wrapper.cuinl new file mode 100644 index 00000000..f5382f91 --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/int8/deconv_int8_implicit_gemm_cutlass_wrapper.cuinl @@ -0,0 +1,62 @@ +/** + * \file + * dnn/src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ +#include "cutlass/convolution/device/convolution.h" +#include "src/cuda/convolution/backward_data/cutlass_deconvolution_wrapper.cuh" + +using namespace megdnn; +using namespace cuda; +using namespace cutlass_wrapper; + +template +void megdnn::cuda::cutlass_wrapper::cutlass_deconvolution_wrapper( + const typename Deconvolution::ElementSrc* d_src, + const typename Deconvolution::ElementFilter* d_filter, + const typename Deconvolution::ElementBias* d_bias, + const typename Deconvolution::ElementDst* d_z, + typename Deconvolution::ElementDst* d_dst, int* workspace, + typename Deconvolution::ConvolutionParameter const& conv_param, + typename Deconvolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream) { + typename Deconvolution::TensorRefSrc tensor_src{ + const_cast(d_src), + Deconvolution::LayoutSrc::packed( + {conv_param.N, conv_param.P, conv_param.Q, conv_param.K})}; + typename Deconvolution::TensorRefFilter tensor_filter{ + const_cast(d_filter), + Deconvolution::LayoutFilter::packed( + {conv_param.K, conv_param.R, conv_param.S, conv_param.C})}; + typename Deconvolution::TensorRefBias tensor_bias{ + const_cast(d_bias), + Deconvolution::LayoutBias::packed({1, 1, 1, conv_param.K})}; + typename Deconvolution::TensorRefDst tensor_z{ + const_cast(d_z), + Deconvolution::LayoutDst::packed( + {conv_param.N, conv_param.H, conv_param.W, conv_param.C})}; + typename Deconvolution::TensorRefDst tensor_dst{ + d_dst, + Deconvolution::LayoutDst::packed( + {conv_param.N, conv_param.H, conv_param.W, conv_param.C})}; + typename Deconvolution::Arguments arguments{conv_param, + tensor_src.non_const_ref(), + tensor_filter.non_const_ref(), + tensor_bias.non_const_ref(), + tensor_z.non_const_ref(), + tensor_dst.non_const_ref(), + epilogue}; + Deconvolution deconv_op; + cutlass_check(deconv_op.initialize(arguments, workspace)); + cutlass_check(deconv_op(stream)); + after_kernel_launch(); +} + +// vim: syntax=cuda.doxygen diff --git a/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu b/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu new file mode 100644 index 00000000..296c397a --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu @@ -0,0 +1,36 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/convolution/backward_data/int8/deconv_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorKxRSCx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; +using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Deconvolution = cutlass::conv::device::Deconvolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::conv::threadblock::ConvolutionDgradNCxHWxThreadblockSwizzle, + 1, 4, 8, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_deconvolution_wrapper( + const typename Deconvolution::ElementSrc* d_src, + const typename Deconvolution::ElementFilter* d_filter, + const typename Deconvolution::ElementBias* d_bias, + const typename Deconvolution::ElementDst* d_z, + typename Deconvolution::ElementDst* d_dst, + int* workspace, + typename Deconvolution::ConvolutionParameter const& conv_param, + typename Deconvolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x64x16_id.cu b/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x64x16_id.cu new file mode 100644 index 00000000..57730346 --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x64x16_id.cu @@ -0,0 +1,36 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/convolution/backward_data/int8/deconv_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorKxRSCx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>; +using WarpShape = cutlass::gemm::GemmShape<16, 64, 16>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Deconvolution = cutlass::conv::device::Deconvolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::conv::threadblock::ConvolutionDgradNCxHWxThreadblockSwizzle, + 2, 4, 4, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_deconvolution_wrapper( + const typename Deconvolution::ElementSrc* d_src, + const typename Deconvolution::ElementFilter* d_filter, + const typename Deconvolution::ElementBias* d_bias, + const typename Deconvolution::ElementDst* d_z, + typename Deconvolution::ElementDst* d_dst, + int* workspace, + typename Deconvolution::ConvolutionParameter const& conv_param, + typename Deconvolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu b/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu new file mode 100644 index 00000000..a22525cd --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_16x64x8_16x64x8_id.cu @@ -0,0 +1,36 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/convolution/backward_data/int8/deconv_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorKxRSCx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<16, 64, 8>; +using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Deconvolution = cutlass::conv::device::Deconvolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::conv::threadblock::ConvolutionDgradNCxHWxThreadblockSwizzle, + 2, 4, 4, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_deconvolution_wrapper( + const typename Deconvolution::ElementSrc* d_src, + const typename Deconvolution::ElementFilter* d_filter, + const typename Deconvolution::ElementBias* d_bias, + const typename Deconvolution::ElementDst* d_z, + typename Deconvolution::ElementDst* d_dst, + int* workspace, + typename Deconvolution::ConvolutionParameter const& conv_param, + typename Deconvolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu b/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu new file mode 100644 index 00000000..400b5db2 --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_32x128x32_32x64x32_id.cu @@ -0,0 +1,36 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/convolution/backward_data/int8/deconv_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorKxRSCx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<32, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Deconvolution = cutlass::conv::device::Deconvolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::conv::threadblock::ConvolutionDgradNCxHWxThreadblockSwizzle, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_deconvolution_wrapper( + const typename Deconvolution::ElementSrc* d_src, + const typename Deconvolution::ElementFilter* d_filter, + const typename Deconvolution::ElementBias* d_bias, + const typename Deconvolution::ElementDst* d_z, + typename Deconvolution::ElementDst* d_dst, + int* workspace, + typename Deconvolution::ConvolutionParameter const& conv_param, + typename Deconvolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu b/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu new file mode 100644 index 00000000..c149a8e1 --- /dev/null +++ b/dnn/src/cuda/convolution/backward_data/int8/kimpl/deconv_int8_implicit_gemm_dp4a_ncdiv4hw4_64x128x32_64x32x32_id.cu @@ -0,0 +1,36 @@ +#if !MEGDNN_TEGRA_X1 +// generated by gen_cuda_conv_bias_kern_impls.py +// ignore warning of cutlass +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wunused-parameter" +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#include "src/cuda/convolution/backward_data/int8/deconv_int8_implicit_gemm_cutlass_wrapper.cuinl" + +using LayoutSrc = cutlass::layout::TensorNCxHWx<4>; +using LayoutFilter = cutlass::layout::TensorKxRSCx<4>; +using LayoutDst = cutlass::layout::TensorNCxHWx<4>; +using ThreadBlockShape = cutlass::gemm::GemmShape<64, 128, 32>; +using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; +using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; +using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< + int8_t, 4, int32_t, int32_t, float>; +using Deconvolution = cutlass::conv::device::Deconvolution< + int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, + LayoutDst, int32_t, LayoutDst, int32_t, + cutlass::arch::OpClassSimt, cutlass::arch::Sm61, + ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, + cutlass::conv::threadblock::ConvolutionDgradNCxHWxThreadblockSwizzle, + 2, 4, 16, true, + cutlass::arch::OpMultiplyAddSaturate>; +template void megdnn::cuda::cutlass_wrapper::cutlass_deconvolution_wrapper( + const typename Deconvolution::ElementSrc* d_src, + const typename Deconvolution::ElementFilter* d_filter, + const typename Deconvolution::ElementBias* d_bias, + const typename Deconvolution::ElementDst* d_z, + typename Deconvolution::ElementDst* d_dst, + int* workspace, + typename Deconvolution::ConvolutionParameter const& conv_param, + typename Deconvolution::EpilogueOutputOp::Params const& epilogue, + cudaStream_t stream); +#pragma GCC diagnostic pop +#endif diff --git a/dnn/src/cuda/convolution/opr_impl.cpp b/dnn/src/cuda/convolution/opr_impl.cpp index badbde22..a72d9c8f 100644 --- a/dnn/src/cuda/convolution/opr_impl.cpp +++ b/dnn/src/cuda/convolution/opr_impl.cpp @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "src/cuda/convolution/opr_impl.h" @@ -25,8 +26,9 @@ using namespace convolution; #define TO_STRING2(v) #v #define TO_STRING(v) TO_STRING2(v) -#define CUDNN_VERSION_STR TO_STRING(CUDNN_MAJOR) "." \ - TO_STRING(CUDNN_MINOR) "." TO_STRING(CUDNN_PATCHLEVEL) +#define CUDNN_VERSION_STR \ + TO_STRING(CUDNN_MAJOR) \ + "." TO_STRING(CUDNN_MINOR) "." TO_STRING(CUDNN_PATCHLEVEL) /* ============== ConvolutionForwardImpl ============== */ ConvolutionForwardImpl::Algorithm* @@ -72,25 +74,24 @@ void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, } const char* ConvolutionForwardImpl::get_algorithm_set_name() const { - return "CUDA CONVOLUTION_FORWARD" ; + return "CUDA CONVOLUTION_FORWARD"; } /* ============== ConvolutionBackwardDataImpl ============== */ void ConvolutionBackwardDataImpl::exec(_megdnn_tensor_in filter, - _megdnn_tensor_in diff, - _megdnn_tensor_out grad, - _megdnn_workspace workspace) { + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) { AlgoBase::ExecArgs args(this, filter, diff, grad, workspace); - auto algo = get_algorithm(this, filter.layout, - diff.layout, grad.layout); + auto algo = get_algorithm(this, filter.layout, diff.layout, grad.layout); algo->check_workspace(args, workspace).exec(args); } -std::vector -ConvolutionBackwardDataImpl::get_all_algorithms(const TensorLayout &filter, - const TensorLayout &diff, - const TensorLayout &grad) { +std::vector +ConvolutionBackwardDataImpl::get_all_algorithms(const TensorLayout& filter, + const TensorLayout& diff, + const TensorLayout& grad) { return megdnn::get_all_algorithms( {this, filter, diff, grad}); } @@ -106,10 +107,10 @@ ConvolutionBackwardDataImpl::get_algorithm_heuristic( } ConvolutionBackwardDataImpl::Algorithm* -ConvolutionBackwardDataImpl::get_algorithm_heuristic(const TensorLayout& filter, - const CanonizedFilterMeta& filter_meta, const TensorLayout& diff, - const TensorLayout& grad, size_t workspace_limit_in_bytes, - bool reproducible) { +ConvolutionBackwardDataImpl::get_algorithm_heuristic( + const TensorLayout& filter, const CanonizedFilterMeta& filter_meta, + const TensorLayout& diff, const TensorLayout& grad, + size_t workspace_limit_in_bytes, bool reproducible) { AlgoBase::SizeArgs args(this, filter, filter_meta, diff, grad); if (args.filter_meta.group > 1 && @@ -119,6 +120,19 @@ ConvolutionBackwardDataImpl::get_algorithm_heuristic(const TensorLayout& filter, return &sm_algo_pack.chanwise; } + if (args.filter_layout->dtype.enumv() == + DTypeTrait::enumv) { + if (reproducible) { + return megdnn::get_reproducible_algo( + sm_algo_pack.int8_algos, args, workspace_limit_in_bytes, + "cuda conv bwd_data"); + } else { + return megdnn::get_usable_algo( + sm_algo_pack.int8_algos, args, workspace_limit_in_bytes, + "cuda conv bwd_data"); + } + } + auto get_cudnn_algo = [this, &args, workspace_limit_in_bytes, reproducible]() -> ConvolutionBackwardDataImpl::AlgoBase* { @@ -206,12 +220,11 @@ ConvolutionBackwardDataImpl::get_algorithm_heuristic(const TensorLayout& filter, } size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes( - const TensorLayout &filter, - const TensorLayout &diff, - const TensorLayout &grad) { + const TensorLayout& filter, const TensorLayout& diff, + const TensorLayout& grad) { AlgoBase::SizeArgs args(this, filter, diff, grad); - return get_algorithm(this, filter, args.filter_meta, diff, grad)-> - get_workspace_in_bytes(args); + return get_algorithm(this, filter, args.filter_meta, diff, grad) + ->get_workspace_in_bytes(args); } const char* ConvolutionBackwardDataImpl::get_algorithm_set_name() const { @@ -221,19 +234,19 @@ const char* ConvolutionBackwardDataImpl::get_algorithm_set_name() const { /* ============== ConvolutionBackwardFilterImpl ============== */ void ConvolutionBackwardFilterImpl::exec(_megdnn_tensor_in src, - _megdnn_tensor_in diff, - _megdnn_tensor_out grad, - _megdnn_workspace workspace) { + _megdnn_tensor_in diff, + _megdnn_tensor_out grad, + _megdnn_workspace workspace) { AlgoBase::ExecArgs args(this, src, diff, grad, workspace); - auto algo = get_algorithm(this, src.layout, diff.layout, - grad.layout, args.grad_filter_meta); + auto algo = get_algorithm(this, src.layout, diff.layout, grad.layout, + args.grad_filter_meta); algo->check_workspace(args, workspace).exec(args); } -std::vector -ConvolutionBackwardFilterImpl::get_all_algorithms(const TensorLayout &src, - const TensorLayout &diff, - const TensorLayout &grad) { +std::vector +ConvolutionBackwardFilterImpl::get_all_algorithms(const TensorLayout& src, + const TensorLayout& diff, + const TensorLayout& grad) { return megdnn::get_all_algorithms( {this, src, diff, grad}); } @@ -269,7 +282,7 @@ ConvolutionBackwardFilterImpl::get_algorithm_heuristic( CUDNNBwdFilterDescs desc; args.init_desc(desc); - //disable, segfault in megbrain, need further investigate. + // disable, segfault in megbrain, need further investigate. #if 0 auto is_heuristic_success = convolution::PerformanceModelBackwardFilter:: @@ -358,12 +371,11 @@ ConvolutionBackwardFilterImpl::get_algorithm_heuristic( } size_t ConvolutionBackwardFilterImpl::get_workspace_in_bytes( - const TensorLayout &src, - const TensorLayout &diff, - const TensorLayout &grad) { + const TensorLayout& src, const TensorLayout& diff, + const TensorLayout& grad) { AlgoBase::SizeArgs args(this, src, diff, grad); - return get_algorithm(this, src, diff, grad, args.grad_filter_meta)-> - get_workspace_in_bytes(args); + return get_algorithm(this, src, diff, grad, args.grad_filter_meta) + ->get_workspace_in_bytes(args); } const char* ConvolutionBackwardFilterImpl::get_algorithm_set_name() const { diff --git a/dnn/src/cuda/convolution/opr_impl.h b/dnn/src/cuda/convolution/opr_impl.h index 1ca8db09..7f9dbeaa 100644 --- a/dnn/src/cuda/convolution/opr_impl.h +++ b/dnn/src/cuda/convolution/opr_impl.h @@ -105,6 +105,7 @@ public: class AlgoChanwiseSmall; class AlgoGroupConvGeneral; class AlgoBFloat16; + class AlgoInt8NCHW4DotProdImplicitGemm; class AlgoPack; diff --git a/dnn/test/common/convolution.cpp b/dnn/test/common/convolution.cpp index 5caebcf8..e3aea119 100644 --- a/dnn/test/common/convolution.cpp +++ b/dnn/test/common/convolution.cpp @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "test/common/checker.h" @@ -44,14 +45,12 @@ std::vector convolution::get_args_common() { param::Convolution param; param.mode = param::Convolution::Mode::CONVOLUTION; - args.emplace_back(param, - TensorShape{5, 2, i, i+1}, - TensorShape{3, 2, 3, 4}); + args.emplace_back(param, TensorShape{5, 2, i, i + 1}, + TensorShape{3, 2, 3, 4}); param.mode = param::Convolution::Mode::CROSS_CORRELATION; - args.emplace_back(param, - TensorShape{5, 2, i, i+1}, - TensorShape{3, 2, 3, 4}); + args.emplace_back(param, TensorShape{5, 2, i, i + 1}, + TensorShape{3, 2, 3, 4}); } return args; @@ -65,14 +64,12 @@ std::vector convolution::get_args_padding() { param.pad_w = 2; param.mode = param::Convolution::Mode::CONVOLUTION; - args.emplace_back(param, - TensorShape{5, 2, i, i+1}, - TensorShape{3, 2, 3, 4}); + args.emplace_back(param, TensorShape{5, 2, i, i + 1}, + TensorShape{3, 2, 3, 4}); param.mode = param::Convolution::Mode::CROSS_CORRELATION; - args.emplace_back(param, - TensorShape{5, 2, i, i+1}, - TensorShape{3, 2, 3, 4}); + args.emplace_back(param, TensorShape{5, 2, i, i + 1}, + TensorShape{3, 2, 3, 4}); } return args; @@ -84,14 +81,12 @@ std::vector convolution::get_args_large_channel() { param::Convolution param; param.mode = param::Convolution::Mode::CONVOLUTION; - args.emplace_back(param, - TensorShape{2, 20, i, i+1}, - TensorShape{30, 20, 3, 4}); + args.emplace_back(param, TensorShape{2, 20, i, i + 1}, + TensorShape{30, 20, 3, 4}); param.mode = param::Convolution::Mode::CROSS_CORRELATION; - args.emplace_back(param, - TensorShape{2, 20, i, i+1}, - TensorShape{30, 20, 3, 4}); + args.emplace_back(param, TensorShape{2, 20, i, i + 1}, + TensorShape{30, 20, 3, 4}); } for (size_t i = 16; i < 24; ++i) { param::Convolution param; @@ -99,14 +94,12 @@ std::vector convolution::get_args_large_channel() { param.pad_w = 2; param.mode = param::Convolution::Mode::CONVOLUTION; - args.emplace_back(param, - TensorShape{2, 20, i, i+1}, - TensorShape{30, 20, 3, 4}); + args.emplace_back(param, TensorShape{2, 20, i, i + 1}, + TensorShape{30, 20, 3, 4}); param.mode = param::Convolution::Mode::CROSS_CORRELATION; - args.emplace_back(param, - TensorShape{2, 20, i, i+1}, - TensorShape{30, 20, 3, 4}); + args.emplace_back(param, TensorShape{2, 20, i, i + 1}, + TensorShape{30, 20, 3, 4}); } return args; @@ -118,14 +111,12 @@ std::vector convolution::get_args_1x1() { param::Convolution param; param.mode = param::Convolution::Mode::CONVOLUTION; - args.emplace_back(param, - TensorShape{2, 20, i, i+1}, - TensorShape{30, 20, 1, 1}); + args.emplace_back(param, TensorShape{2, 20, i, i + 1}, + TensorShape{30, 20, 1, 1}); param.mode = param::Convolution::Mode::CROSS_CORRELATION; - args.emplace_back(param, - TensorShape{2, 20, i, i+1}, - TensorShape{30, 20, 1, 1}); + args.emplace_back(param, TensorShape{2, 20, i, i + 1}, + TensorShape{30, 20, 1, 1}); } return args; @@ -137,14 +128,12 @@ std::vector convolution::get_args_large_filter() { param::Convolution param; param.mode = param::Convolution::Mode::CONVOLUTION; - args.emplace_back(param, - TensorShape{2, 2, i, i+1}, - TensorShape{3, 2, 7, 8}); + args.emplace_back(param, TensorShape{2, 2, i, i + 1}, + TensorShape{3, 2, 7, 8}); param.mode = param::Convolution::Mode::CROSS_CORRELATION; - args.emplace_back(param, - TensorShape{2, 2, i, i+1}, - TensorShape{3, 2, 7, 8}); + args.emplace_back(param, TensorShape{2, 2, i, i + 1}, + TensorShape{3, 2, 7, 8}); } return args; @@ -181,9 +170,8 @@ std::vector convolution::get_args_4x4() { for (size_t oh = 1; oh < 20; ++oh) { param::Convolution param; param.mode = param::Convolution::Mode::CROSS_CORRELATION; - args.emplace_back(param, - TensorShape{4, 3, oh+3, oh+4}, - TensorShape{2, 3, 4, 4}); + args.emplace_back(param, TensorShape{4, 3, oh + 3, oh + 4}, + TensorShape{2, 3, 4, 4}); } return args; @@ -289,26 +277,22 @@ std::vector convolution::get_args_fallback_non_templated_impl() { std::vector convolution::get_args_cudnn_5_1_failures() { std::vector args; args.emplace_back( - param::Convolution{ - param::Convolution::Mode::CROSS_CORRELATION, 0, 4, 1, 2}, - TensorShape{5, 3, 25, 20}, - TensorShape{10, 3, 7, 4} - ); + param::Convolution{param::Convolution::Mode::CROSS_CORRELATION, 0, + 4, 1, 2}, + TensorShape{5, 3, 25, 20}, TensorShape{10, 3, 7, 4}); return args; } std::vector convolution::get_args_x86_winograd_algorithm() { std::vector args; - for (size_t ic_size: {8, 16}) - { + for (size_t ic_size : {8, 16}) { param::Convolution param; param.mode = param::Convolution::Mode::CROSS_CORRELATION; param.stride_h = param.stride_w = 1; param.pad_h = param.pad_w = 0; - args.emplace_back(param, - TensorShape{2, ic_size, 102, 102}, - TensorShape{8, ic_size, 3, 3}); + args.emplace_back(param, TensorShape{2, ic_size, 102, 102}, + TensorShape{8, ic_size, 3, 3}); } return args; @@ -317,18 +301,15 @@ std::vector convolution::get_args_x86_winograd_algorithm() { std::vector convolution::get_args_BRAIN_481() { std::vector args; { - param::Convolution param{param::Convolution::Mode::CROSS_CORRELATION, - 0, 1, 1, 2}; - args.emplace_back(param, - TensorShape{4, 4, 14, 13}, - TensorShape{3, 4, 8, 13}); - for (size_t margin = 0; margin < 5; ++margin) - { - param::Convolution param{param::Convolution::Mode::CROSS_CORRELATION, - 1, 1, 2, 2}; - args.emplace_back(param, - TensorShape{4, 4, 14, 13}, - TensorShape{3, 4, 16-margin, 15-margin}); + param::Convolution param{param::Convolution::Mode::CROSS_CORRELATION, 0, + 1, 1, 2}; + args.emplace_back(param, TensorShape{4, 4, 14, 13}, + TensorShape{3, 4, 8, 13}); + for (size_t margin = 0; margin < 5; ++margin) { + param::Convolution param{ + param::Convolution::Mode::CROSS_CORRELATION, 1, 1, 2, 2}; + args.emplace_back(param, TensorShape{4, 4, 14, 13}, + TensorShape{3, 4, 16 - margin, 15 - margin}); } } @@ -337,7 +318,7 @@ std::vector convolution::get_args_BRAIN_481() { std::vector convolution::get_args() { std::vector all_args, args; -#define ADD_ARGS(NAME) \ +#define ADD_ARGS(NAME) \ args = get_args_##NAME(); \ all_args.insert(all_args.end(), args.begin(), args.end()); ADD_ARGS(common) @@ -356,12 +337,12 @@ std::vector convolution::get_args() { ADD_ARGS(BRAIN_481) #undef ADD_ARGS - return all_args; + return all_args; } std::vector convolution::get_args_cuda_conv_bwd_data() { std::vector all_args, args; -#define ADD_ARGS(NAME) \ +#define ADD_ARGS(NAME) \ args = get_args_##NAME(); \ all_args.insert(all_args.end(), args.begin(), args.end()); ADD_ARGS(common) @@ -378,19 +359,19 @@ std::vector convolution::get_args_cuda_conv_bwd_data() { ADD_ARGS(x86_winograd_algorithm) #undef ADD_ARGS - return all_args; + return all_args; } std::vector convolution::get_args_cudnn_7_5_failures() { std::vector all_args, args; -#define ADD_ARGS(NAME) \ +#define ADD_ARGS(NAME) \ args = get_args_##NAME(); \ all_args.insert(all_args.end(), args.begin(), args.end()); ADD_ARGS(cudnn_5_1_failures) ADD_ARGS(BRAIN_481) #undef ADD_ARGS - return all_args; + return all_args; } std::vector convolution::get_chanwise_args() { std::vector args; @@ -421,12 +402,9 @@ std::vector convolution::get_dilated_args() { param::Convolution param; param.pad_h = param.pad_w = 2; param.dilate_h = param.dilate_w = 2; - size_t n = 1, ic = 15, ih = 128, iw = 128, - fh = 3, fw = 3, - oc = 17; - args.emplace_back(param, - TensorShape{n, ic, ih, iw}, - TensorShape{oc, ic, fh, fw}); + size_t n = 1, ic = 15, ih = 128, iw = 128, fh = 3, fw = 3, oc = 17; + args.emplace_back(param, TensorShape{n, ic, ih, iw}, + TensorShape{oc, ic, fh, fw}); // exhaustive search // clang-format off for (size_t n: {2}) @@ -451,13 +429,44 @@ std::vector convolution::get_dilated_args() { return args; } -void convolution::test_conv_config_combinations(int k_size, - Handle* handle, bool test_int8, - bool test_backward, - bool is_cuda, - ConvEPSGetter eps_getter, - bool use_io16xc32) { - Checker checker(handle); +std::vector convolution::get_args_int8_nchw4_conv_bwd_data() { + std::vector args; + param::Convolution cur_param; + + // clang-format off + for (auto mode : {param::ConvBias::Mode::CROSS_CORRELATION}) { + for (size_t b : {64, 16}) { + for (size_t ic : {16, 32}) { + for (size_t oc : {16, 32}) { + for (size_t h : {8}) { + for (size_t w : {8, 11}) { + for (size_t kernel_size : {3, 4, 5, 7}) { + for (int p : {0, static_cast(kernel_size / 2)}) { + for (size_t s : {2}) { + if (kernel_size >= 7) { + b = std::min(b, 32_z); + } + size_t f = kernel_size; + cur_param.mode = mode; + + cur_param.format = param::ConvBias::Format::NCHW4; + cur_param.sparse = param::ConvBias::Sparse::DENSE; + cur_param.pad_h = cur_param.pad_w = p; + cur_param.stride_h = cur_param.stride_w = s; + + //! bias channel + args.emplace_back(cur_param, TensorShape{b, ic / 4, h, w, 4}, + TensorShape{oc, ic / 4, f, f, 4}); + } } } } } } } } } + // clang-format on + + return args; +} + +void convolution::test_conv_config_combinations( + int k_size, Handle* handle, bool test_int8, bool test_backward, + bool is_cuda, ConvEPSGetter eps_getter, bool use_io16xc32) { +Checker checker(handle); std::unique_ptr> checker_bwd_data_ptr; std::unique_ptr> checker_bwd_filter_ptr; if (test_backward) { @@ -657,7 +666,6 @@ void convolution::test_conv_config_combinations(int k_size, } } } - } // vim: syntax=cpp.doxygen diff --git a/dnn/test/common/convolution.h b/dnn/test/common/convolution.h index 2d9a338d..99b3a8f9 100644 --- a/dnn/test/common/convolution.h +++ b/dnn/test/common/convolution.h @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #pragma once @@ -47,6 +48,7 @@ std::vector get_args_cudnn_7_5_failures(); std::vector get_1x1_args(); std::vector get_dilated_args(); std::vector get_chanwise_args(); +std::vector get_args_int8_nchw4_conv_bwd_data(); //! \param stage 0 for fwd, 1 for bwd data, 2 for bwd filter using ConvEPSGetter = diff --git a/dnn/test/cuda/convolution.cpp b/dnn/test/cuda/convolution.cpp index 8dd7f4b8..5d6a2a6f 100644 --- a/dnn/test/cuda/convolution.cpp +++ b/dnn/test/cuda/convolution.cpp @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. */ #include "megdnn/dtype.h" #include "megdnn/oprs.h" @@ -18,7 +19,6 @@ #include "test/common/convolution.h" #include "test/common/rng.h" #include "test/cuda/benchmark.h" - #include "src/cuda/utils.h" #define V1(x) #x @@ -29,8 +29,7 @@ namespace megdnn { namespace test { -TEST_F(CUDA, CONVOLUTION_8X8X32) -{ +TEST_F(CUDA, CONVOLUTION_8X8X32) { if (!cuda::is_compute_capability_required(6, 1)) { printf("Skip CUDA.CONVOLUTION_8X8X32 test as current device" "doesn't support\n"); @@ -41,66 +40,63 @@ TEST_F(CUDA, CONVOLUTION_8X8X32) std::vector args; { auto v = get_args(); - for (auto &&a: v) { + for (auto&& a : v) { args.push_back(std::move(a)); } } { auto v = get_dilated_args(); - for (auto &&a: v) { + for (auto&& a : v) { args.push_back(std::move(a)); } } { auto v = get_chanwise_args(); - for (auto &&a: v) { + for (auto&& a : v) { args.push_back(std::move(a)); } } Checker checker(handle_cuda()); UniformIntRNG rng(-4, 4); - for (auto arg: args) { + for (auto arg : args) { arg.param.format = param::Convolution::Format::NHWC; arg.src = cvt_src_or_dst_nchw2nhwc(arg.src); arg.filter = cvt_filter_nchw2nhwc(arg.filter); - checker.set_dtype(0, dtype::Int8()). - set_dtype(1, dtype::Int8()). - set_dtype(2, dtype::Int32()). - set_param(arg.param). - set_rng(0, &rng). - set_rng(1, &rng). - execs({arg.src, arg.filter, {}}); + checker.set_dtype(0, dtype::Int8()) + .set_dtype(1, dtype::Int8()) + .set_dtype(2, dtype::Int32()) + .set_param(arg.param) + .set_rng(0, &rng) + .set_rng(1, &rng) + .execs({arg.src, arg.filter, {}}); } } -TEST_F(CUDA, CONVOLUTION_FORWARD) -{ +TEST_F(CUDA, CONVOLUTION_FORWARD) { using namespace convolution; std::vector args = get_args(); Checker checker(handle_cuda()); NormalRNG default_rng; - for (auto &&arg: args) { + for (auto&& arg : args) { float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]); UniformFloatRNG rng(scale, 2 * scale); - checker. - set_dtype(0, dtype::Float32()). - set_dtype(1, dtype::Float32()). - set_dtype(2, dtype::Float32()). - set_rng(0, &default_rng). - set_rng(1, &default_rng). - set_epsilon(1e-3). - set_param(arg.param). - execs({arg.src, arg.filter, {}}); - checker. - set_dtype(0, dtype::Float16()). - set_dtype(1, dtype::Float16()). - set_dtype(2, dtype::Float16()). - set_rng(0, &rng). - set_rng(1, &rng). - set_epsilon(1e-1). - set_param(arg.param). - execs({arg.src, arg.filter, {}}); + checker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_dtype(2, dtype::Float32()) + .set_rng(0, &default_rng) + .set_rng(1, &default_rng) + .set_epsilon(1e-3) + .set_param(arg.param) + .execs({arg.src, arg.filter, {}}); + checker.set_dtype(0, dtype::Float16()) + .set_dtype(1, dtype::Float16()) + .set_dtype(2, dtype::Float16()) + .set_rng(0, &rng) + .set_rng(1, &rng) + .set_epsilon(1e-1) + .set_param(arg.param) + .execs({arg.src, arg.filter, {}}); arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32; checker.set_dtype(0, dtype::Float16()) .set_dtype(1, dtype::Float16()) @@ -152,51 +148,49 @@ TEST_F(CUDA, CONV_FORWARD_MATMUL_NCHW4) { checker.exec({{8, 64, 12, 12, 4}, {256, 64, 3, 3, 4}, {}}); } -TEST_F(CUDA, CONVOLUTION_1X1_FORWARD) -{ +TEST_F(CUDA, CONVOLUTION_1X1_FORWARD) { using namespace convolution; std::vector args = get_1x1_args(); Checker checker(handle_cuda()); NormalRNG default_rng; - for (auto &&arg: args) { - float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]); + for (auto&& arg : args) { + float scale = + 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]); UniformFloatRNG rng(scale, 2 * scale); - checker. - set_dtype(0, dtype::Float32()). - set_dtype(1, dtype::Float32()). - set_rng(0, &default_rng). - set_rng(1, &default_rng). - set_epsilon(1e-3). - set_param(arg.param). - execs({arg.src, arg.filter, {}}); + checker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_rng(0, &default_rng) + .set_rng(1, &default_rng) + .set_epsilon(1e-3) + .set_param(arg.param) + .execs({arg.src, arg.filter, {}}); } } -TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) -{ +TEST_F(CUDA, BENCHMARK_CONVOLUTION_1X1_FORWARD) { using namespace convolution; std::vector args = get_1x1_args(); Benchmarker marker(handle_cuda()); NormalRNG default_rng; - for (auto &&arg: args) { - float scale = 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]); + for (auto&& arg : args) { + float scale = + 1.0f / sqrt(arg.filter[1] * arg.filter[2] * arg.filter[3]); UniformFloatRNG rng(scale, 2 * scale); - marker.set_dtype(0, dtype::Float32()). - set_dtype(1, dtype::Float32()). - set_rng(0, &default_rng). - set_rng(1, &default_rng). - set_param(arg.param). - execs({arg.src, arg.filter, {}}); + marker.set_dtype(0, dtype::Float32()) + .set_dtype(1, dtype::Float32()) + .set_rng(0, &default_rng) + .set_rng(1, &default_rng) + .set_param(arg.param) + .execs({arg.src, arg.filter, {}}); } } -TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) -{ +TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) { using namespace convolution; std::vector args = get_args_cuda_conv_bwd_data(); Checker checker(handle_cuda()); NormalRNG default_rng; - for (auto &&arg: args) { + for (auto&& arg : args) { float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]); UniformFloatRNG rng(scale, 2 * scale); @@ -243,8 +237,7 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA) } } -TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_MATMUL) -{ +TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_MATMUL) { using namespace convolution; std::vector args = get_args_cuda_conv_bwd_data(); Checker checker(handle_cuda()); @@ -252,7 +245,7 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_MATMUL) checker.set_before_exec_callback(AlgoChecker( ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}})); NormalRNG default_rng; - for (auto &&arg: args) { + for (auto&& arg : args) { float scale = 64.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]); UniformFloatRNG rng(scale, 2 * scale); @@ -273,9 +266,39 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_MATMUL) } } +TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_INT8_DP4A) { + if (!cuda::is_compute_capability_required(6, 1)) { + printf("Skip CUDA.CONVOLUTION_BACKWARD_DATA_INT8_DP4A test as current " + "device doesn't support\n"); + return; + } + + using namespace convolution; + std::vector args = get_args_int8_nchw4_conv_bwd_data(); + Checker checker(handle_cuda()); + + checker.set_before_exec_callback(AlgoChecker( + "INT8_NCHW4_DOTPROD_IMPLICIT_GEMM")); + + checker.set_epsilon(1 + 1e-3).set_max_avg_error(1e-1); -TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5) -{ + for (auto&& arg : args) { + UniformIntRNG rng(-3, 3); + auto src = TensorLayout(arg.src, dtype::QuantizedS8{1.2f}); + auto filter = TensorLayout(arg.filter, dtype::QuantizedS8{1.3f}); + TensorLayout dst; + dst.dtype = dtype::QuantizedS8{1.2f}; + { + auto opr = handle_cuda()->create_operator(); + opr->param() = arg.param; + opr->deduce_layout(src, filter, dst); + } + checker.set_rng(0, &rng).set_rng(1, &rng).set_param(arg.param).exec( + TensorLayoutArray{filter, dst, src}); + } +} + +TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5) { // BRAIN-481 failed on architectures 7.0, remove the following if statement, // when cudnn fixed the problem. if (cuda::is_compute_capability_required(7, 0)) @@ -284,8 +307,9 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5) std::vector args = get_args_cudnn_7_5_failures(); Checker checker(handle_cuda()); NormalRNG default_rng; - for (auto &&arg: args) { - float scale = 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]); + for (auto&& arg : args) { + float scale = + 128.f / sqrt(arg.filter[0] * arg.filter[2] * arg.filter[3]); scale = std::max(scale, 1.f); UniformFloatRNG rng(scale, 2 * scale); auto src = TensorLayout(arg.src, dtype::Float32()); @@ -297,19 +321,17 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5) opr->deduce_layout(src, filter, dst); } src.dtype = dst.dtype = filter.dtype = dtype::Float32(); - checker. - set_rng(0, &default_rng). - set_rng(1, &default_rng). - set_epsilon(1e-3). - set_param(arg.param). - exec(TensorLayoutArray{filter, dst, src}); + checker.set_rng(0, &default_rng) + .set_rng(1, &default_rng) + .set_epsilon(1e-3) + .set_param(arg.param) + .exec(TensorLayoutArray{filter, dst, src}); src.dtype = dst.dtype = filter.dtype = dtype::Float16(); - checker. - set_rng(0, &rng). - set_rng(1, &rng). - set_epsilon(1e-1). - set_param(arg.param). - exec(TensorLayoutArray{filter, dst, src}); + checker.set_rng(0, &rng) + .set_rng(1, &rng) + .set_epsilon(1e-1) + .set_param(arg.param) + .exec(TensorLayoutArray{filter, dst, src}); arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32; checker.set_rng(0, &rng) .set_rng(1, &rng) @@ -319,13 +341,12 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_DATA_FAILED_CUDNN7_5) } } -TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER) -{ +TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER) { using namespace convolution; std::vector args = get_args(); Checker checker(handle_cuda()); bool f16_checked = false; - for (auto &&arg: args) { + for (auto&& arg : args) { auto src = TensorLayout(arg.src, dtype::Float32()); auto filter = TensorLayout(arg.filter, dtype::Float32()); TensorLayout dst; @@ -337,12 +358,11 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER) float scale = 1.0f / sqrt(dst[2] * dst[3]); UniformFloatRNG rng(scale, 2 * scale); src.dtype = dst.dtype = filter.dtype = dtype::Float32(); - checker. - set_rng(0, &rng). - set_rng(1, &rng). - set_epsilon(1e-3). - set_param(arg.param). - exec(TensorLayoutArray{src, dst, filter}); + checker.set_rng(0, &rng) + .set_rng(1, &rng) + .set_epsilon(1e-3) + .set_param(arg.param) + .exec(TensorLayoutArray{src, dst, filter}); // reduce on large f16 array may introduce significant error if (dst.total_nr_elems() >= 1000 && f16_checked) @@ -350,12 +370,11 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER) f16_checked = true; src.dtype = dst.dtype = filter.dtype = dtype::Float16(); - checker. - set_rng(0, &rng). - set_rng(1, &rng). - set_epsilon(1e-1). - set_param(arg.param). - exec(TensorLayoutArray{src, dst, filter}); + checker.set_rng(0, &rng) + .set_rng(1, &rng) + .set_epsilon(1e-1) + .set_param(arg.param) + .exec(TensorLayoutArray{src, dst, filter}); arg.param.compute_mode = param::Convolution::ComputeMode::FLOAT32; checker.set_rng(0, &rng) .set_rng(1, &rng) @@ -377,14 +396,13 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER) } } -TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_MATMUL) -{ +TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_MATMUL) { using namespace convolution; std::vector args = get_args(); Checker checker(handle_cuda()); checker.set_before_exec_callback(AlgoChecker( ExecutionPolicyAlgoName{"MATMUL", {{"CUBLAS", {}}}})); - for (auto &&arg: args) { + for (auto&& arg : args) { auto src = TensorLayout(arg.src, dtype::Float32()); auto filter = TensorLayout(arg.filter, dtype::Float32()); TensorLayout dst; @@ -396,17 +414,16 @@ TEST_F(CUDA, CONVOLUTION_BACKWARD_FILTER_MATMUL) float scale = 1.0f / sqrt(dst[2] * dst[3]); UniformFloatRNG rng(scale, 2 * scale); src.dtype = dst.dtype = filter.dtype = dtype::Float32(); - checker. - set_rng(0, &rng). - set_rng(1, &rng). - set_epsilon(1e-3). - set_param(arg.param). - exec(TensorLayoutArray{src, dst, filter}); + checker.set_rng(0, &rng) + .set_rng(1, &rng) + .set_epsilon(1e-3) + .set_param(arg.param) + .exec(TensorLayoutArray{src, dst, filter}); } } TEST_F(CUDA, CONV_CONFIG_COMBINATIONS) { - auto eps_getter = [](bool f16, int stage, const char *name) -> float { + auto eps_getter = [](bool f16, int stage, const char* name) -> float { if (f16) { return stage == 2 ? 0.5 : 0.2; } @@ -687,6 +704,46 @@ TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_BF16) { run(32, 64, 64, 56, 56, 1, 1, 0); } +TEST_F(CUDA, BENCHMARK_CONVOLUTION_BWD_DATA_INT8_DP4A) { + CUBenchmarker bench{handle_cuda()}; + std::unique_ptr> proxy{ + new OprProxy{true}}; + size_t RUNS = 10; + bench.set_proxy(proxy).set_times(RUNS); + + auto run = [&](size_t N, size_t OC, size_t IC, size_t IH, size_t IW, + size_t FH, size_t SH, size_t PH) { + bench.set_dtype(0, dtype::QuantizedS8{1.0f}) + .set_dtype(1, dtype::QuantizedS8{1.0f}) + .set_dtype(2, dtype::QuantizedS8{1.0f}); + param::Convolution param; + param.format = param::Convolution::Format::NCHW4; + param.stride_h = param.stride_w = SH; + param.pad_h = param.pad_w = PH; + param.compute_mode = param::Convolution::ComputeMode::DEFAULT; + bench.set_param(param); + bench.proxy()->target_execution_policy = {}; + TensorLayout src{{N, IC / 4, IH, IW, 4}, dtype::QuantizedS8{1.0f}}, + filter{{OC, IC / 4, FH, FH, 4}, dtype::QuantizedS8{1.0f}}; + TensorLayout dst; + dst.dtype = dtype::QuantizedS8{1.0f}; + { + auto&& opr = handle_cuda()->create_operator(); + opr->param() = param; + opr->deduce_layout(src, filter, dst); + } + auto used = bench.execl({filter, dst, src}) / RUNS; + float flo = 2.0 * N * OC * IC * dst[2] * dst[3] * FH * FH; + printf("inp=%s, kern=%s, dst=%s ", src.to_string().c_str(), + filter.to_string().c_str(), dst.to_string().c_str()); + printf("time_fp32=%.2fms, flops=%.3fTFLOPS\n", used, + (flo / (used * 1e9))); + }; + run(64, 32, 32, 92, 180, 4, 2, 2); + run(64, 32, 32, 46, 80, 4, 2, 2); + run(16, 16, 16, 92, 180, 4, 2, 2); + run(16, 16, 16, 46, 80, 4, 2, 2); +} TEST_F(CUDA, CONVOLUTION_BWD_FILTER_BENCHMARK) { CUBenchmarker bench{handle_cuda()}; diff --git a/src/opr/test/dnn/convolution.cpp b/src/opr/test/dnn/convolution.cpp index 9e779f27..89972604 100644 --- a/src/opr/test/dnn/convolution.cpp +++ b/src/opr/test/dnn/convolution.cpp @@ -598,6 +598,51 @@ TEST(TestOprDNN, Deconvolution) { run({TensorShape{4, 6, 7, 2}, {2, 3, 4, 8, 13}}, opt); } +TEST(TestOprDNN, DeconvolutionExePolicy_QuantizedS8) { + REQUIRE_GPU(1); + auto cn = CompNode::load("gpu0"); + cn.activate(); + REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1); + + Param param; + using Policy = opr::ConvolutionBackwardData::ExecutionPolicy; + using S = Policy::Strategy; + +#if MGB_ENABLE_FASTRUN + for (auto strategy : {S::PROFILE, S::HEURISTIC, S::PROFILE_REPRODUCIBLE, + S::PROFILE_HEURISTIC}) { +#else + for (auto strategy : {S : HEURISTIC, S::PROFILE_HEURISTIC}) { +#endif + auto graph = ComputingGraph::make(); + HostTensorGenerator<> gen; + + auto mkvar = [&](const char* name, const TensorShape& shp, + const DType& dtype) { + return opr::TypeCvt::make( + opr::Host2DeviceCopy::make(*graph, gen(shp)).rename(name), + dtype); + }; + + auto x = mkvar("x", {16, 4, 50, 50, 4}, dtype::QuantizedS8(1.2f)); + auto w = mkvar("w", {16, 4, 4, 4, 4}, dtype::QuantizedS8(1.3f)); + + param.format = Param::Format::NCHW4; + param.pad_h = param.pad_w = 2; + param.stride_h = param.stride_w = 2; + + Policy policy; + policy.strategy = strategy; + + auto deconv = opr::ConvolutionBackwardData::make_deconv( + x, w, param, policy, + OperatorNodeConfig{dtype::QuantizedS8(1.2f)}); + HostTensorND host_y; + auto func = graph->compile({make_callback_copy(deconv, host_y)}); + func->execute(); + } +} + TEST(TestOprDNN, ConvolutionBackwardFilter) { using Checker = AutoOprChecker<3, 1>;