GitOrigin-RevId: 49e0565e8a
tags/v1.3.0
@@ -5,6 +5,7 @@ dnn/src/cuda/conv_bias/int8_imma/kimpl/* binary | |||||
dnn/src/cuda/batch_conv_bias/int8/kimpl/* binary | dnn/src/cuda/batch_conv_bias/int8/kimpl/* binary | ||||
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/* binary | dnn/src/cuda/matrix_mul/fp32_simt/kimpl/* binary | ||||
dnn/src/cuda/sass/prebuilt/map_defs.cpp binary | dnn/src/cuda/sass/prebuilt/map_defs.cpp binary | ||||
dnn/src/cuda/convolution/backward_data/int8/kimpl/* binary | |||||
tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text | tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text | ||||
*.caffemodel filter=lfs diff=lfs merge=lfs -text | *.caffemodel filter=lfs diff=lfs merge=lfs -text | ||||
imperative/python/test/integration/data/*.mge filter=lfs diff=lfs merge=lfs -text | imperative/python/test/integration/data/*.mge filter=lfs diff=lfs merge=lfs -text | ||||
@@ -46,7 +46,7 @@ void make_canonized_filter_meta_nchw_nhwc( | |||||
size_t src_ndim, const TensorLayout& filter, const Param& param, | size_t src_ndim, const TensorLayout& filter, const Param& param, | ||||
typename ConvolutionBase<Parameter>::CanonizedFilterMeta& ret) { | typename ConvolutionBase<Parameter>::CanonizedFilterMeta& ret) { | ||||
megdnn_assert(param.format == Param::Format::NCHW || | megdnn_assert(param.format == Param::Format::NCHW || | ||||
param.format == Param::Format::NHWC ); | |||||
param.format == Param::Format::NHWC); | |||||
auto img_ndim = src_ndim - 2; | auto img_ndim = src_ndim - 2; | ||||
size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; | size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; | ||||
if (param.sparse == Param::Sparse::DENSE) { | if (param.sparse == Param::Sparse::DENSE) { | ||||
@@ -320,8 +320,8 @@ void make_canonized_filter_meta_nchwxx( | |||||
img_ndim, filter.ndim); | img_ndim, filter.ndim); | ||||
megdnn_assert((filter[filter.ndim - 1] == pack_size && | megdnn_assert((filter[filter.ndim - 1] == pack_size && | ||||
filter[filter.ndim - 2] == pack_size) || | filter[filter.ndim - 2] == pack_size) || | ||||
(filter[filter.ndim - 1] == 2 * pack_size && | |||||
filter[filter.ndim - 2] == 2 * pack_size), | |||||
(filter[filter.ndim - 1] == 2 * pack_size && | |||||
filter[filter.ndim - 2] == 2 * pack_size), | |||||
"last 2 dim of filter must be %zu, but got %zu, %zu", | "last 2 dim of filter must be %zu, but got %zu, %zu", | ||||
pack_size, filter[filter.ndim - 2], | pack_size, filter[filter.ndim - 2], | ||||
filter[filter.ndim - 1]); | filter[filter.ndim - 1]); | ||||
@@ -684,7 +684,8 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||||
} | } | ||||
if (param().format == Param::Format::NCHW44 || | if (param().format == Param::Format::NCHW44 || | ||||
param().format == Param::Format::NCHW44_DOT) { | param().format == Param::Format::NCHW44_DOT) { | ||||
//!support nchw44 filter change to 88 for int8 winogradf23_88 using MK8 mamtul | |||||
//! support nchw44 filter change to 88 for int8 winogradf23_88 using | |||||
//! MK8 mamtul | |||||
megdnn_assert((src.ndim == 4 && filter.ndim == 5 && | megdnn_assert((src.ndim == 4 && filter.ndim == 5 && | ||||
filter[filter.ndim - 1] == 4) || | filter[filter.ndim - 1] == 4) || | ||||
(src.ndim == 5 && | (src.ndim == 5 && | ||||
@@ -716,7 +717,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||||
"currently only convolution on 2D image is supported"); | "currently only convolution on 2D image is supported"); | ||||
auto cflt = make_canonized_filter_meta(src.ndim, filter); | auto cflt = make_canonized_filter_meta(src.ndim, filter); | ||||
if (param().format == Param::Format::NCHW || | if (param().format == Param::Format::NCHW || | ||||
param().format == Param::Format::NHWC ) { | |||||
param().format == Param::Format::NHWC) { | |||||
size_t src_or_dst_c_pos = 0; | size_t src_or_dst_c_pos = 0; | ||||
size_t src_or_dst_spatial_start = 0; | size_t src_or_dst_spatial_start = 0; | ||||
if (param().format == Param::Format::NCHW) { | if (param().format == Param::Format::NCHW) { | ||||
@@ -790,7 +791,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||||
dst[3] = infer_conv_shape(src[3], cflt.dilated_spatial[1], | dst[3] = infer_conv_shape(src[3], cflt.dilated_spatial[1], | ||||
cflt.stride[1], cflt.padding[1]); | cflt.stride[1], cflt.padding[1]); | ||||
dst[4] = 32; | dst[4] = 32; | ||||
} else if (param().format == Param::Format::NCHW88 ) { | |||||
} else if (param().format == Param::Format::NCHW88) { | |||||
megdnn_assert(src.ndim == 5 || (src.ndim == 4 && src[1] <= 8), | megdnn_assert(src.ndim == 5 || (src.ndim == 4 && src[1] <= 8), | ||||
"invalid src ndim for NCHW88, expected=5 or 4, got=%zu", | "invalid src ndim for NCHW88, expected=5 or 4, got=%zu", | ||||
src.ndim); | src.ndim); | ||||
@@ -1042,10 +1043,10 @@ void ConvolutionBackwardData::deduce_dtype(DType filter, DType diff, | |||||
} | } | ||||
megdnn_assert(param().compute_mode != Param::ComputeMode::FLOAT32 | megdnn_assert(param().compute_mode != Param::ComputeMode::FLOAT32 | ||||
#if !MEGDNN_DISABLE_FLOAT16 | #if !MEGDNN_DISABLE_FLOAT16 | ||||
|| filter.enumv() == DTypeEnum::Float16 | |||||
|| filter.enumv() == DTypeEnum::BFloat16 | |||||
|| filter.enumv() == DTypeEnum::Float16 || | |||||
filter.enumv() == DTypeEnum::BFloat16 | |||||
#endif | #endif | ||||
, | |||||
, | |||||
"ComputeMode::FLOAT32 is only available for Float16/BFloat16 " | "ComputeMode::FLOAT32 is only available for Float16/BFloat16 " | ||||
"input / output."); | "input / output."); | ||||
} | } | ||||
@@ -1096,6 +1097,24 @@ void ConvolutionBackwardData::deduce_layout(const TensorLayout& filter, | |||||
diff[i + src_or_dst_spatial_start], cflt.dilated_spatial[i], | diff[i + src_or_dst_spatial_start], cflt.dilated_spatial[i], | ||||
cflt.stride[i], cflt.padding[i]); | cflt.stride[i], cflt.padding[i]); | ||||
} | } | ||||
} else if (param().format == Param::Format::NCHW4) { | |||||
megdnn_assert(diff.ndim == 5, | |||||
"valid diff ndim for NCHW4, expected=5, got=%zu", | |||||
diff.ndim); | |||||
megdnn_assert(cflt.group == 1, "%s", errmsg().c_str()); | |||||
megdnn_assert(cflt.ocpg * cflt.group == diff[1] * 4, "%s", | |||||
errmsg().c_str()); | |||||
grad.ndim = diff.ndim; | |||||
grad[0] = diff[0]; | |||||
auto ic = cflt.icpg * cflt.group; | |||||
megdnn_assert(ic % 4 == 0); | |||||
grad[1] = ic / 4; | |||||
grad[2] = deduce(diff[2], cflt.dilated_spatial[0], cflt.stride[0], | |||||
cflt.padding[0]); | |||||
grad[3] = deduce(diff[3], cflt.dilated_spatial[1], cflt.stride[1], | |||||
cflt.padding[1]); | |||||
megdnn_assert(diff[4] == 4); | |||||
grad[4] = 4; | |||||
} else { | } else { | ||||
megdnn_assert(param().format == Param::Format::NHWCD4); | megdnn_assert(param().format == Param::Format::NHWCD4); | ||||
megdnn_assert(diff.ndim == 5, | megdnn_assert(diff.ndim == 5, | ||||
@@ -62,22 +62,21 @@ void megdnn::cuda::cutlass_wrapper:: | |||||
threadblock_k_>; \ | threadblock_k_>; \ | ||||
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | ||||
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; \ | using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; \ | ||||
using Convolution = cutlass::convolution::device::Convolution< \ | |||||
using Convolution = cutlass::conv::device::Convolution< \ | |||||
int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t, \ | int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t, \ | ||||
cutlass::layout::TensorCxRSKx<32>, ElementOutput, \ | cutlass::layout::TensorCxRSKx<32>, ElementOutput, \ | ||||
cutlass::layout::TensorNCxHWx<32>, int32_t, \ | cutlass::layout::TensorNCxHWx<32>, int32_t, \ | ||||
cutlass::layout::TensorNCxHWx<32>, int32_t, \ | cutlass::layout::TensorNCxHWx<32>, int32_t, \ | ||||
cutlass::convolution::ConvType::kConvolution, \ | |||||
cutlass::conv::ConvType::kConvolution, \ | |||||
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \ | cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \ | ||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | ||||
cutlass::convolution::threadblock:: \ | |||||
ConvolutionNCxHWxThreadblockSwizzle< \ | |||||
cutlass::convolution::ConvType::kConvolution>, \ | |||||
cutlass::conv::threadblock:: \ | |||||
ConvolutionFpropNCxHWxThreadblockSwizzle, \ | |||||
2, 16, 16, NeedLoadFromConstMem>; \ | 2, 16, 16, NeedLoadFromConstMem>; \ | ||||
typename Convolution::ConvolutionParameter conv_param{ \ | |||||
param.n, param.ci, param.co, param.hi, param.wi, \ | |||||
param.fh, param.fw, param.ho, param.wo, param.sh, \ | |||||
param.sw, param.ph, param.pw, 1, 1}; \ | |||||
typename Convolution::ConvolutionParameter conv_param( \ | |||||
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ | |||||
param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ | |||||
param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ | |||||
return cutlass_convolution_wrapper<Convolution>( \ | return cutlass_convolution_wrapper<Convolution>( \ | ||||
d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | ||||
epilogue, stream); \ | epilogue, stream); \ | ||||
@@ -186,22 +185,21 @@ void megdnn::cuda::cutlass_wrapper:: | |||||
threadblock_k_>; \ | threadblock_k_>; \ | ||||
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | ||||
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; \ | using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; \ | ||||
using Convolution = cutlass::convolution::device::Convolution< \ | |||||
using Convolution = cutlass::conv::device::Convolution< \ | |||||
int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t, \ | int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t, \ | ||||
cutlass::layout::TensorCxRSKx<32>, ElementOutput, \ | cutlass::layout::TensorCxRSKx<32>, ElementOutput, \ | ||||
cutlass::layout::TensorNCxHWx<4>, int32_t, \ | cutlass::layout::TensorNCxHWx<4>, int32_t, \ | ||||
cutlass::layout::TensorNCxHWx<4>, int32_t, \ | cutlass::layout::TensorNCxHWx<4>, int32_t, \ | ||||
cutlass::convolution::ConvType::kConvolution, \ | |||||
cutlass::conv::ConvType::kConvolution, \ | |||||
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \ | cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \ | ||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | ||||
cutlass::convolution::threadblock:: \ | |||||
ConvolutionNCxHWxThreadblockSwizzle< \ | |||||
cutlass::convolution::ConvType::kConvolution>, \ | |||||
cutlass::conv::threadblock:: \ | |||||
ConvolutionFpropNCxHWxThreadblockSwizzle, \ | |||||
2, 16, 16, NeedLoadFromConstMem>; \ | 2, 16, 16, NeedLoadFromConstMem>; \ | ||||
typename Convolution::ConvolutionParameter conv_param{ \ | |||||
param.n, param.ci, param.co, param.hi, param.wi, \ | |||||
param.fh, param.fw, param.ho, param.wo, param.sh, \ | |||||
param.sw, param.ph, param.pw, 1, 1}; \ | |||||
typename Convolution::ConvolutionParameter conv_param( \ | |||||
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ | |||||
param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ | |||||
param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ | |||||
return cutlass_convolution_wrapper<Convolution>( \ | return cutlass_convolution_wrapper<Convolution>( \ | ||||
d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | ||||
epilogue, stream); \ | epilogue, stream); \ | ||||
@@ -311,22 +309,21 @@ void megdnn::cuda::cutlass_wrapper:: | |||||
threadblock_k_>; \ | threadblock_k_>; \ | ||||
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | ||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ | ||||
using Convolution = cutlass::convolution::device::Convolution< \ | |||||
using Convolution = cutlass::conv::device::Convolution< \ | |||||
int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ | int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ | ||||
cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ | cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ | ||||
cutlass::layout::TensorNCxHWx<4>, int32_t, \ | cutlass::layout::TensorNCxHWx<4>, int32_t, \ | ||||
cutlass::layout::TensorNCxHWx<4>, int32_t, \ | cutlass::layout::TensorNCxHWx<4>, int32_t, \ | ||||
cutlass::convolution::ConvType::kConvolution, \ | |||||
cutlass::conv::ConvType::kConvolution, \ | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ | cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ | ||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | ||||
cutlass::convolution::threadblock:: \ | |||||
ConvolutionNCxHWxThreadblockSwizzle< \ | |||||
cutlass::convolution::ConvType::kConvolution>, \ | |||||
cutlass::conv::threadblock:: \ | |||||
ConvolutionFpropNCxHWxThreadblockSwizzle, \ | |||||
stage_, 4, aligned_, NeedLoadFromConstMem>; \ | stage_, 4, aligned_, NeedLoadFromConstMem>; \ | ||||
typename Convolution::ConvolutionParameter conv_param{ \ | |||||
param.n, param.ci, param.co, param.hi, param.wi, \ | |||||
param.fh, param.fw, param.ho, param.wo, param.sh, \ | |||||
param.sw, param.ph, param.pw, 1, 1}; \ | |||||
typename Convolution::ConvolutionParameter conv_param( \ | |||||
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ | |||||
param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ | |||||
param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ | |||||
return cutlass_convolution_wrapper<Convolution>( \ | return cutlass_convolution_wrapper<Convolution>( \ | ||||
d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | ||||
epilogue, stream); \ | epilogue, stream); \ | ||||
@@ -441,23 +438,22 @@ void megdnn::cuda::cutlass_wrapper:: | |||||
threadblock_k_>; \ | threadblock_k_>; \ | ||||
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | ||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ | ||||
using Convolution = cutlass::convolution::device::Convolution< \ | |||||
using Convolution = cutlass::conv::device::Convolution< \ | |||||
int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ | int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ | ||||
cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ | cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ | ||||
cutlass::layout::TensorNCHW, float, \ | cutlass::layout::TensorNCHW, float, \ | ||||
cutlass::layout::TensorNCHW, int32_t, \ | cutlass::layout::TensorNCHW, int32_t, \ | ||||
cutlass::convolution::ConvType::kConvolution, \ | |||||
cutlass::conv::ConvType::kConvolution, \ | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ | cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ | ||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | ||||
cutlass::convolution::threadblock:: \ | |||||
ConvolutionNCxHWxThreadblockSwizzle< \ | |||||
cutlass::convolution::ConvType::kConvolution>, \ | |||||
cutlass::conv::threadblock:: \ | |||||
ConvolutionFpropNCxHWxThreadblockSwizzle, \ | |||||
stages_, 4, aligned_, NeedLoadFromConstMem, \ | stages_, 4, aligned_, NeedLoadFromConstMem, \ | ||||
cutlass::arch::OpMultiplyAdd>; \ | cutlass::arch::OpMultiplyAdd>; \ | ||||
typename Convolution::ConvolutionParameter conv_param{ \ | |||||
param.n, param.ci, param.co, param.hi, param.wi, \ | |||||
param.fh, param.fw, param.ho, param.wo, param.sh, \ | |||||
param.sw, param.ph, param.pw, 1, 1}; \ | |||||
typename Convolution::ConvolutionParameter conv_param( \ | |||||
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ | |||||
param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ | |||||
param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ | |||||
return cutlass_convolution_wrapper<Convolution>( \ | return cutlass_convolution_wrapper<Convolution>( \ | ||||
d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | ||||
epilogue, stream); \ | epilogue, stream); \ | ||||
@@ -572,36 +568,35 @@ void megdnn::cuda::cutlass_wrapper:: | |||||
threadblock_k_>; \ | threadblock_k_>; \ | ||||
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | ||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ | ||||
using Convolution = cutlass::convolution::device::Convolution< \ | |||||
using Convolution = cutlass::conv::device::Convolution< \ | |||||
int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ | int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ | ||||
cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ | cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ | ||||
cutlass::layout::TensorNCxHWx<32>, int32_t, \ | cutlass::layout::TensorNCxHWx<32>, int32_t, \ | ||||
cutlass::layout::TensorNCxHWx<32>, int32_t, \ | cutlass::layout::TensorNCxHWx<32>, int32_t, \ | ||||
cutlass::convolution::ConvType::kConvolution, \ | |||||
cutlass::conv::ConvType::kConvolution, \ | |||||
cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ | cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ | ||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | ||||
cutlass::convolution::threadblock:: \ | |||||
ConvolutionNCxHWxThreadblockSwizzle< \ | |||||
cutlass::convolution::ConvType::kConvolution>, \ | |||||
cutlass::conv::threadblock:: \ | |||||
ConvolutionFpropNCxHWxThreadblockSwizzle, \ | |||||
stages_, 4, aligned_, NeedLoadFromConstMem>; \ | stages_, 4, aligned_, NeedLoadFromConstMem>; \ | ||||
typename Convolution::ConvolutionParameter conv_param{ \ | |||||
param.n, param.ci, param.co, param.hi, param.wi, \ | |||||
param.fh, param.fw, param.ho, param.wo, param.sh, \ | |||||
param.sw, param.ph, param.pw, 1, 1}; \ | |||||
typename Convolution::ConvolutionParameter conv_param( \ | |||||
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ | |||||
param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ | |||||
param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ | |||||
return cutlass_convolution_wrapper<Convolution>( \ | return cutlass_convolution_wrapper<Convolution>( \ | ||||
d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | ||||
epilogue, stream); \ | epilogue, stream); \ | ||||
} | } | ||||
#define DISPATCH_KERNEL \ | #define DISPATCH_KERNEL \ | ||||
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \ | |||||
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \ | |||||
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \ | |||||
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \ | |||||
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \ | |||||
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \ | |||||
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \ | |||||
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \ | |||||
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \ | |||||
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \ | |||||
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \ | |||||
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \ | |||||
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \ | |||||
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \ | |||||
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \ | |||||
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \ | |||||
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \ | |||||
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \ | |||||
megdnn_assert(false, \ | megdnn_assert(false, \ | ||||
"unsupported threadblock shape (%dx%dx%d) and warp shape " \ | "unsupported threadblock shape (%dx%dx%d) and warp shape " \ | ||||
"(%dx%dx%d)", \ | "(%dx%dx%d)", \ | ||||
@@ -29,28 +29,30 @@ void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( | |||||
cudaStream_t stream) { | cudaStream_t stream) { | ||||
typename Convolution::TensorRefSrc tensor_src{ | typename Convolution::TensorRefSrc tensor_src{ | ||||
const_cast<typename Convolution::ElementSrc*>(d_src), | const_cast<typename Convolution::ElementSrc*>(d_src), | ||||
Convolution::LayoutSrc::packed({conv_param.n(), conv_param.hi(), | |||||
conv_param.wi(), conv_param.ci()})}; | |||||
Convolution::LayoutSrc::packed( | |||||
{conv_param.N, conv_param.H, conv_param.W, conv_param.C})}; | |||||
typename Convolution::TensorRefFilter tensor_filter{ | typename Convolution::TensorRefFilter tensor_filter{ | ||||
const_cast<typename Convolution::ElementFilter*>(d_filter), | const_cast<typename Convolution::ElementFilter*>(d_filter), | ||||
Convolution::LayoutFilter::packed({conv_param.co(), conv_param.fh(), | |||||
conv_param.fw(), | |||||
conv_param.ci()})}; | |||||
Convolution::LayoutFilter::packed( | |||||
{conv_param.K, conv_param.R, conv_param.S, conv_param.C})}; | |||||
typename Convolution::TensorRefBias tensor_bias{ | typename Convolution::TensorRefBias tensor_bias{ | ||||
const_cast<typename Convolution::ElementBias*>(d_bias), | const_cast<typename Convolution::ElementBias*>(d_bias), | ||||
Convolution::LayoutBias::packed({1, 1, 1, conv_param.co()})}; | |||||
Convolution::LayoutBias::packed({1, 1, 1, conv_param.K})}; | |||||
typename Convolution::TensorRefDst tensor_z{ | typename Convolution::TensorRefDst tensor_z{ | ||||
const_cast<typename Convolution::ElementDst*>(d_z), | const_cast<typename Convolution::ElementDst*>(d_z), | ||||
Convolution::LayoutDst::packed({conv_param.n(), conv_param.ho(), | |||||
conv_param.wo(), conv_param.co()})}; | |||||
Convolution::LayoutDst::packed( | |||||
{conv_param.N, conv_param.P, conv_param.Q, conv_param.K})}; | |||||
typename Convolution::TensorRefDst tensor_dst{ | typename Convolution::TensorRefDst tensor_dst{ | ||||
d_dst, | d_dst, | ||||
Convolution::LayoutDst::packed({conv_param.n(), conv_param.ho(), | |||||
conv_param.wo(), conv_param.co()})}; | |||||
typename Convolution::Arguments arguments{ | |||||
conv_param, tensor_src, tensor_filter, | |||||
tensor_bias, tensor_z, tensor_dst.non_const_ref(), | |||||
epilogue}; | |||||
Convolution::LayoutDst::packed( | |||||
{conv_param.N, conv_param.P, conv_param.Q, conv_param.K})}; | |||||
typename Convolution::Arguments arguments{conv_param, | |||||
tensor_src.non_const_ref(), | |||||
tensor_filter.non_const_ref(), | |||||
tensor_bias.non_const_ref(), | |||||
tensor_z.non_const_ref(), | |||||
tensor_dst.non_const_ref(), | |||||
epilogue}; | |||||
Convolution conv_op; | Convolution conv_op; | ||||
cutlass_check(conv_op.initialize(arguments, workspace)); | cutlass_check(conv_op.initialize(arguments, workspace)); | ||||
cutlass_check(conv_op(stream)); | cutlass_check(conv_op(stream)); | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
1, 4, 8, true, | 1, 4, 8, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
1, 4, 8, true, | 1, 4, 8, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
1, 4, 8, true, | 1, 4, 8, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 4, true, | 2, 4, 4, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 4, true, | 2, 4, 4, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 4, true, | 2, 4, 4, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
1, 4, 8, false, | 1, 4, 8, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
1, 4, 8, false, | 1, 4, 8, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
1, 4, 8, false, | 1, 4, 8, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 4, false, | 2, 4, 4, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 4, false, | 2, 4, 4, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 4, false, | 2, 4, 4, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, true, | 2, 4, 16, true, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | ||||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | ||||
int8_t, 4, int32_t, int32_t, float>; | int8_t, 4, int32_t, int32_t, float>; | ||||
using Convolution = cutlass::convolution::device::Convolution< | |||||
using Convolution = cutlass::conv::device::Convolution< | |||||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | ||||
LayoutDst, int32_t, LayoutDst, int32_t, | LayoutDst, int32_t, LayoutDst, int32_t, | ||||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | ||||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||||
cutlass::convolution::ConvType::kConvolution>, | |||||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||||
2, 4, 16, false, | 2, 4, 16, false, | ||||
cutlass::arch::OpMultiplyAddSaturate>; | cutlass::arch::OpMultiplyAddSaturate>; | ||||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | ||||