GitOrigin-RevId: 49e0565e8a
tags/v1.3.0
@@ -5,6 +5,7 @@ dnn/src/cuda/conv_bias/int8_imma/kimpl/* binary | |||
dnn/src/cuda/batch_conv_bias/int8/kimpl/* binary | |||
dnn/src/cuda/matrix_mul/fp32_simt/kimpl/* binary | |||
dnn/src/cuda/sass/prebuilt/map_defs.cpp binary | |||
dnn/src/cuda/convolution/backward_data/int8/kimpl/* binary | |||
tools/mlir/mlir-tblgen filter=lfs diff=lfs merge=lfs -text | |||
*.caffemodel filter=lfs diff=lfs merge=lfs -text | |||
imperative/python/test/integration/data/*.mge filter=lfs diff=lfs merge=lfs -text | |||
@@ -46,7 +46,7 @@ void make_canonized_filter_meta_nchw_nhwc( | |||
size_t src_ndim, const TensorLayout& filter, const Param& param, | |||
typename ConvolutionBase<Parameter>::CanonizedFilterMeta& ret) { | |||
megdnn_assert(param.format == Param::Format::NCHW || | |||
param.format == Param::Format::NHWC ); | |||
param.format == Param::Format::NHWC); | |||
auto img_ndim = src_ndim - 2; | |||
size_t flt_start, flt_spatial_start, ocpg_pos, icpg_pos; | |||
if (param.sparse == Param::Sparse::DENSE) { | |||
@@ -320,8 +320,8 @@ void make_canonized_filter_meta_nchwxx( | |||
img_ndim, filter.ndim); | |||
megdnn_assert((filter[filter.ndim - 1] == pack_size && | |||
filter[filter.ndim - 2] == pack_size) || | |||
(filter[filter.ndim - 1] == 2 * pack_size && | |||
filter[filter.ndim - 2] == 2 * pack_size), | |||
(filter[filter.ndim - 1] == 2 * pack_size && | |||
filter[filter.ndim - 2] == 2 * pack_size), | |||
"last 2 dim of filter must be %zu, but got %zu, %zu", | |||
pack_size, filter[filter.ndim - 2], | |||
filter[filter.ndim - 1]); | |||
@@ -684,7 +684,8 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||
} | |||
if (param().format == Param::Format::NCHW44 || | |||
param().format == Param::Format::NCHW44_DOT) { | |||
//!support nchw44 filter change to 88 for int8 winogradf23_88 using MK8 mamtul | |||
//! support nchw44 filter change to 88 for int8 winogradf23_88 using | |||
//! MK8 mamtul | |||
megdnn_assert((src.ndim == 4 && filter.ndim == 5 && | |||
filter[filter.ndim - 1] == 4) || | |||
(src.ndim == 5 && | |||
@@ -716,7 +717,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||
"currently only convolution on 2D image is supported"); | |||
auto cflt = make_canonized_filter_meta(src.ndim, filter); | |||
if (param().format == Param::Format::NCHW || | |||
param().format == Param::Format::NHWC ) { | |||
param().format == Param::Format::NHWC) { | |||
size_t src_or_dst_c_pos = 0; | |||
size_t src_or_dst_spatial_start = 0; | |||
if (param().format == Param::Format::NCHW) { | |||
@@ -790,7 +791,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||
dst[3] = infer_conv_shape(src[3], cflt.dilated_spatial[1], | |||
cflt.stride[1], cflt.padding[1]); | |||
dst[4] = 32; | |||
} else if (param().format == Param::Format::NCHW88 ) { | |||
} else if (param().format == Param::Format::NCHW88) { | |||
megdnn_assert(src.ndim == 5 || (src.ndim == 4 && src[1] <= 8), | |||
"invalid src ndim for NCHW88, expected=5 or 4, got=%zu", | |||
src.ndim); | |||
@@ -1042,10 +1043,10 @@ void ConvolutionBackwardData::deduce_dtype(DType filter, DType diff, | |||
} | |||
megdnn_assert(param().compute_mode != Param::ComputeMode::FLOAT32 | |||
#if !MEGDNN_DISABLE_FLOAT16 | |||
|| filter.enumv() == DTypeEnum::Float16 | |||
|| filter.enumv() == DTypeEnum::BFloat16 | |||
|| filter.enumv() == DTypeEnum::Float16 || | |||
filter.enumv() == DTypeEnum::BFloat16 | |||
#endif | |||
, | |||
, | |||
"ComputeMode::FLOAT32 is only available for Float16/BFloat16 " | |||
"input / output."); | |||
} | |||
@@ -1096,6 +1097,24 @@ void ConvolutionBackwardData::deduce_layout(const TensorLayout& filter, | |||
diff[i + src_or_dst_spatial_start], cflt.dilated_spatial[i], | |||
cflt.stride[i], cflt.padding[i]); | |||
} | |||
} else if (param().format == Param::Format::NCHW4) { | |||
megdnn_assert(diff.ndim == 5, | |||
"valid diff ndim for NCHW4, expected=5, got=%zu", | |||
diff.ndim); | |||
megdnn_assert(cflt.group == 1, "%s", errmsg().c_str()); | |||
megdnn_assert(cflt.ocpg * cflt.group == diff[1] * 4, "%s", | |||
errmsg().c_str()); | |||
grad.ndim = diff.ndim; | |||
grad[0] = diff[0]; | |||
auto ic = cflt.icpg * cflt.group; | |||
megdnn_assert(ic % 4 == 0); | |||
grad[1] = ic / 4; | |||
grad[2] = deduce(diff[2], cflt.dilated_spatial[0], cflt.stride[0], | |||
cflt.padding[0]); | |||
grad[3] = deduce(diff[3], cflt.dilated_spatial[1], cflt.stride[1], | |||
cflt.padding[1]); | |||
megdnn_assert(diff[4] == 4); | |||
grad[4] = 4; | |||
} else { | |||
megdnn_assert(param().format == Param::Format::NHWCD4); | |||
megdnn_assert(diff.ndim == 5, | |||
@@ -62,22 +62,21 @@ void megdnn::cuda::cutlass_wrapper:: | |||
threadblock_k_>; \ | |||
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | |||
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; \ | |||
using Convolution = cutlass::convolution::device::Convolution< \ | |||
using Convolution = cutlass::conv::device::Convolution< \ | |||
int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t, \ | |||
cutlass::layout::TensorCxRSKx<32>, ElementOutput, \ | |||
cutlass::layout::TensorNCxHWx<32>, int32_t, \ | |||
cutlass::layout::TensorNCxHWx<32>, int32_t, \ | |||
cutlass::convolution::ConvType::kConvolution, \ | |||
cutlass::conv::ConvType::kConvolution, \ | |||
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \ | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | |||
cutlass::convolution::threadblock:: \ | |||
ConvolutionNCxHWxThreadblockSwizzle< \ | |||
cutlass::convolution::ConvType::kConvolution>, \ | |||
cutlass::conv::threadblock:: \ | |||
ConvolutionFpropNCxHWxThreadblockSwizzle, \ | |||
2, 16, 16, NeedLoadFromConstMem>; \ | |||
typename Convolution::ConvolutionParameter conv_param{ \ | |||
param.n, param.ci, param.co, param.hi, param.wi, \ | |||
param.fh, param.fw, param.ho, param.wo, param.sh, \ | |||
param.sw, param.ph, param.pw, 1, 1}; \ | |||
typename Convolution::ConvolutionParameter conv_param( \ | |||
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ | |||
param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ | |||
param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ | |||
return cutlass_convolution_wrapper<Convolution>( \ | |||
d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | |||
epilogue, stream); \ | |||
@@ -186,22 +185,21 @@ void megdnn::cuda::cutlass_wrapper:: | |||
threadblock_k_>; \ | |||
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | |||
using InstructionShape = cutlass::gemm::GemmShape<8, 8, 16>; \ | |||
using Convolution = cutlass::convolution::device::Convolution< \ | |||
using Convolution = cutlass::conv::device::Convolution< \ | |||
int8_t, cutlass::layout::TensorNCxHWx<32>, int8_t, \ | |||
cutlass::layout::TensorCxRSKx<32>, ElementOutput, \ | |||
cutlass::layout::TensorNCxHWx<4>, int32_t, \ | |||
cutlass::layout::TensorNCxHWx<4>, int32_t, \ | |||
cutlass::convolution::ConvType::kConvolution, \ | |||
cutlass::conv::ConvType::kConvolution, \ | |||
cutlass::arch::OpClassTensorOp, cutlass::arch::Sm75, \ | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | |||
cutlass::convolution::threadblock:: \ | |||
ConvolutionNCxHWxThreadblockSwizzle< \ | |||
cutlass::convolution::ConvType::kConvolution>, \ | |||
cutlass::conv::threadblock:: \ | |||
ConvolutionFpropNCxHWxThreadblockSwizzle, \ | |||
2, 16, 16, NeedLoadFromConstMem>; \ | |||
typename Convolution::ConvolutionParameter conv_param{ \ | |||
param.n, param.ci, param.co, param.hi, param.wi, \ | |||
param.fh, param.fw, param.ho, param.wo, param.sh, \ | |||
param.sw, param.ph, param.pw, 1, 1}; \ | |||
typename Convolution::ConvolutionParameter conv_param( \ | |||
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ | |||
param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ | |||
param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ | |||
return cutlass_convolution_wrapper<Convolution>( \ | |||
d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | |||
epilogue, stream); \ | |||
@@ -311,22 +309,21 @@ void megdnn::cuda::cutlass_wrapper:: | |||
threadblock_k_>; \ | |||
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ | |||
using Convolution = cutlass::convolution::device::Convolution< \ | |||
using Convolution = cutlass::conv::device::Convolution< \ | |||
int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ | |||
cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ | |||
cutlass::layout::TensorNCxHWx<4>, int32_t, \ | |||
cutlass::layout::TensorNCxHWx<4>, int32_t, \ | |||
cutlass::convolution::ConvType::kConvolution, \ | |||
cutlass::conv::ConvType::kConvolution, \ | |||
cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | |||
cutlass::convolution::threadblock:: \ | |||
ConvolutionNCxHWxThreadblockSwizzle< \ | |||
cutlass::convolution::ConvType::kConvolution>, \ | |||
cutlass::conv::threadblock:: \ | |||
ConvolutionFpropNCxHWxThreadblockSwizzle, \ | |||
stage_, 4, aligned_, NeedLoadFromConstMem>; \ | |||
typename Convolution::ConvolutionParameter conv_param{ \ | |||
param.n, param.ci, param.co, param.hi, param.wi, \ | |||
param.fh, param.fw, param.ho, param.wo, param.sh, \ | |||
param.sw, param.ph, param.pw, 1, 1}; \ | |||
typename Convolution::ConvolutionParameter conv_param( \ | |||
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ | |||
param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ | |||
param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ | |||
return cutlass_convolution_wrapper<Convolution>( \ | |||
d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | |||
epilogue, stream); \ | |||
@@ -441,23 +438,22 @@ void megdnn::cuda::cutlass_wrapper:: | |||
threadblock_k_>; \ | |||
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ | |||
using Convolution = cutlass::convolution::device::Convolution< \ | |||
using Convolution = cutlass::conv::device::Convolution< \ | |||
int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ | |||
cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ | |||
cutlass::layout::TensorNCHW, float, \ | |||
cutlass::layout::TensorNCHW, int32_t, \ | |||
cutlass::convolution::ConvType::kConvolution, \ | |||
cutlass::conv::ConvType::kConvolution, \ | |||
cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | |||
cutlass::convolution::threadblock:: \ | |||
ConvolutionNCxHWxThreadblockSwizzle< \ | |||
cutlass::convolution::ConvType::kConvolution>, \ | |||
cutlass::conv::threadblock:: \ | |||
ConvolutionFpropNCxHWxThreadblockSwizzle, \ | |||
stages_, 4, aligned_, NeedLoadFromConstMem, \ | |||
cutlass::arch::OpMultiplyAdd>; \ | |||
typename Convolution::ConvolutionParameter conv_param{ \ | |||
param.n, param.ci, param.co, param.hi, param.wi, \ | |||
param.fh, param.fw, param.ho, param.wo, param.sh, \ | |||
param.sw, param.ph, param.pw, 1, 1}; \ | |||
typename Convolution::ConvolutionParameter conv_param( \ | |||
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ | |||
param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ | |||
param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ | |||
return cutlass_convolution_wrapper<Convolution>( \ | |||
d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | |||
epilogue, stream); \ | |||
@@ -572,36 +568,35 @@ void megdnn::cuda::cutlass_wrapper:: | |||
threadblock_k_>; \ | |||
using WarpShape = cutlass::gemm::GemmShape<warp_m_, warp_n_, warp_k_>; \ | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; \ | |||
using Convolution = cutlass::convolution::device::Convolution< \ | |||
using Convolution = cutlass::conv::device::Convolution< \ | |||
int8_t, cutlass::layout::TensorNCxHWx<4>, int8_t, \ | |||
cutlass::layout::TensorCxRSKx<4>, ElementOutput, \ | |||
cutlass::layout::TensorNCxHWx<32>, int32_t, \ | |||
cutlass::layout::TensorNCxHWx<32>, int32_t, \ | |||
cutlass::convolution::ConvType::kConvolution, \ | |||
cutlass::conv::ConvType::kConvolution, \ | |||
cutlass::arch::OpClassSimt, cutlass::arch::Sm61, \ | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, \ | |||
cutlass::convolution::threadblock:: \ | |||
ConvolutionNCxHWxThreadblockSwizzle< \ | |||
cutlass::convolution::ConvType::kConvolution>, \ | |||
cutlass::conv::threadblock:: \ | |||
ConvolutionFpropNCxHWxThreadblockSwizzle, \ | |||
stages_, 4, aligned_, NeedLoadFromConstMem>; \ | |||
typename Convolution::ConvolutionParameter conv_param{ \ | |||
param.n, param.ci, param.co, param.hi, param.wi, \ | |||
param.fh, param.fw, param.ho, param.wo, param.sh, \ | |||
param.sw, param.ph, param.pw, 1, 1}; \ | |||
typename Convolution::ConvolutionParameter conv_param( \ | |||
param.n, param.hi, param.wi, param.ci, param.co, param.fh, \ | |||
param.fw, param.ho, param.wo, param.ph, param.pw, param.sh, \ | |||
param.sw, 1, 1, cutlass::conv::Mode::kCrossCorrelation); \ | |||
return cutlass_convolution_wrapper<Convolution>( \ | |||
d_src, d_filter, d_bias, d_z, d_dst, workspace, conv_param, \ | |||
epilogue, stream); \ | |||
} | |||
#define DISPATCH_KERNEL \ | |||
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \ | |||
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \ | |||
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \ | |||
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \ | |||
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \ | |||
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \ | |||
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \ | |||
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \ | |||
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \ | |||
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \ | |||
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \ | |||
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \ | |||
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \ | |||
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \ | |||
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \ | |||
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \ | |||
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \ | |||
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \ | |||
megdnn_assert(false, \ | |||
"unsupported threadblock shape (%dx%dx%d) and warp shape " \ | |||
"(%dx%dx%d)", \ | |||
@@ -29,28 +29,30 @@ void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper( | |||
cudaStream_t stream) { | |||
typename Convolution::TensorRefSrc tensor_src{ | |||
const_cast<typename Convolution::ElementSrc*>(d_src), | |||
Convolution::LayoutSrc::packed({conv_param.n(), conv_param.hi(), | |||
conv_param.wi(), conv_param.ci()})}; | |||
Convolution::LayoutSrc::packed( | |||
{conv_param.N, conv_param.H, conv_param.W, conv_param.C})}; | |||
typename Convolution::TensorRefFilter tensor_filter{ | |||
const_cast<typename Convolution::ElementFilter*>(d_filter), | |||
Convolution::LayoutFilter::packed({conv_param.co(), conv_param.fh(), | |||
conv_param.fw(), | |||
conv_param.ci()})}; | |||
Convolution::LayoutFilter::packed( | |||
{conv_param.K, conv_param.R, conv_param.S, conv_param.C})}; | |||
typename Convolution::TensorRefBias tensor_bias{ | |||
const_cast<typename Convolution::ElementBias*>(d_bias), | |||
Convolution::LayoutBias::packed({1, 1, 1, conv_param.co()})}; | |||
Convolution::LayoutBias::packed({1, 1, 1, conv_param.K})}; | |||
typename Convolution::TensorRefDst tensor_z{ | |||
const_cast<typename Convolution::ElementDst*>(d_z), | |||
Convolution::LayoutDst::packed({conv_param.n(), conv_param.ho(), | |||
conv_param.wo(), conv_param.co()})}; | |||
Convolution::LayoutDst::packed( | |||
{conv_param.N, conv_param.P, conv_param.Q, conv_param.K})}; | |||
typename Convolution::TensorRefDst tensor_dst{ | |||
d_dst, | |||
Convolution::LayoutDst::packed({conv_param.n(), conv_param.ho(), | |||
conv_param.wo(), conv_param.co()})}; | |||
typename Convolution::Arguments arguments{ | |||
conv_param, tensor_src, tensor_filter, | |||
tensor_bias, tensor_z, tensor_dst.non_const_ref(), | |||
epilogue}; | |||
Convolution::LayoutDst::packed( | |||
{conv_param.N, conv_param.P, conv_param.Q, conv_param.K})}; | |||
typename Convolution::Arguments arguments{conv_param, | |||
tensor_src.non_const_ref(), | |||
tensor_filter.non_const_ref(), | |||
tensor_bias.non_const_ref(), | |||
tensor_z.non_const_ref(), | |||
tensor_dst.non_const_ref(), | |||
epilogue}; | |||
Convolution conv_op; | |||
cutlass_check(conv_op.initialize(arguments, workspace)); | |||
cutlass_check(conv_op(stream)); | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
1, 4, 8, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
1, 4, 8, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
1, 4, 8, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 4, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 4, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 4, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
1, 4, 8, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
1, 4, 8, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
1, 4, 8, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 4, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 4, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<16, 64, 8>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 4, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, true, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<32, 64, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||
@@ -14,13 +14,12 @@ using WarpShape = cutlass::gemm::GemmShape<64, 32, 32>; | |||
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>; | |||
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp< | |||
int8_t, 4, int32_t, int32_t, float>; | |||
using Convolution = cutlass::convolution::device::Convolution< | |||
using Convolution = cutlass::conv::device::Convolution< | |||
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t, | |||
LayoutDst, int32_t, LayoutDst, int32_t, | |||
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
cutlass::conv::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61, | |||
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp, | |||
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle< | |||
cutlass::convolution::ConvType::kConvolution>, | |||
cutlass::conv::threadblock::ConvolutionFpropNCxHWxThreadblockSwizzle, | |||
2, 4, 16, false, | |||
cutlass::arch::OpMultiplyAddSaturate>; | |||
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>( | |||