GitOrigin-RevId: 878b7de9de
tags/v1.3.0
@@ -286,7 +286,8 @@ void megdnn::cuda::cutlass_wrapper::
         uint32_t /* nonlinear_mode */, float /* alpha */,
         float /* beta */, float /* gamma */, float /* scale */,
         const GemmCoord& /* threadblock_shape */,
-        const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {}
+        const GemmCoord& /* warp_shape */, int /* stages */,
+        cudaStream_t /* stream */) {}
 #else
 template <bool NeedLoadFromConstMem>
 void megdnn::cuda::cutlass_wrapper::
@@ -296,15 +297,15 @@ void megdnn::cuda::cutlass_wrapper::
         int* workspace, const convolution::ConvParam& param,
         uint32_t nonlinear_mode, float alpha, float beta, float gamma,
         float scale, const GemmCoord& threadblock_shape,
-        const GemmCoord& warp_shape, cudaStream_t stream) {
+        const GemmCoord& warp_shape, int stages, cudaStream_t stream) {
 #define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \
                                         threadblock_k_, warp_m_, warp_n_, \
-                                        warp_k_, stage_, aligned_) \
+                                        warp_k_, stage_, aligned_) \
     if (threadblock_shape.m() == threadblock_m_ && \
         threadblock_shape.n() == threadblock_n_ && \
         threadblock_shape.k() == threadblock_k_ && \
         warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \
-        warp_shape.k() == warp_k_) { \
+        warp_shape.k() == warp_k_ && stages == stage_) { \
         using ThreadBlockShape = \
                 cutlass::gemm::GemmShape<threadblock_m_, threadblock_n_, \
                                          threadblock_k_>; \
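The change above threads the pipeline stage count through kernel dispatch: a kernel is selected only when the runtime (threadblock shape, warp shape, stages) tuple matches one of the compiled configurations, and any other combination falls through to the megdnn_assert at the end of the macro chain. A minimal standalone sketch of this dispatch pattern (illustrative names, not MegEngine's):

// Runtime (threadblock, warp, stages) tuples are matched against a fixed
// list of compile-time kernel configurations; unmatched tuples fall through.
#include <cstdio>

struct GemmCoord {
    int m_, n_, k_;
    int m() const { return m_; }
    int n() const { return n_; }
    int k() const { return k_; }
};

template <int Tbm, int Tbn, int Tbk, int Wm, int Wn, int Wk, int Stages>
void run_kernel() {
    std::printf("launch: tb=%dx%dx%d warp=%dx%dx%d stages=%d\n", Tbm, Tbn,
                Tbk, Wm, Wn, Wk, Stages);
}

#define DISPATCH(tbm_, tbn_, tbk_, wm_, wn_, wk_, stages_)                 \
    if (tb.m() == tbm_ && tb.n() == tbn_ && tb.k() == tbk_ &&              \
        wp.m() == wm_ && wp.n() == wn_ && wp.k() == wk_ &&                 \
        stages == stages_) {                                               \
        run_kernel<tbm_, tbn_, tbk_, wm_, wn_, wk_, stages_>();            \
        return;                                                            \
    }

void dispatch(const GemmCoord& tb, const GemmCoord& wp, int stages) {
    DISPATCH(128, 128, 32, 64, 32, 32, 2)
    DISPATCH(16, 128, 16, 16, 128, 16, 1)
    std::printf("unsupported configuration\n");
}

int main() {
    dispatch({16, 128, 16}, {16, 128, 16}, 1);  // hits the 1-stage kernel
}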
@@ -397,7 +398,8 @@ void megdnn::cuda::cutlass_wrapper::
                     uint32_t nonlinear_mode, float alpha, float beta, \
                     float gamma, float scale, \
                     const GemmCoord& threadblock_shape, \
-                    const GemmCoord& warp_shape, cudaStream_t stream);
+                    const GemmCoord& warp_shape, int stages, \
+                    cudaStream_t stream);
 INST(true);
 INST(false);
 #undef INST
@@ -414,7 +416,8 @@ void megdnn::cuda::cutlass_wrapper::
         uint32_t /* nonlinear_mode */, float /* alpha */,
         float /* beta */, float /* gamma */, float /* scale */,
         const GemmCoord& /* threadblock_shape */,
-        const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {}
+        const GemmCoord& /* warp_shape */, int /* stages */,
+        cudaStream_t /* stream */) {}
 #else
 template <bool NeedLoadFromConstMem>
 void megdnn::cuda::cutlass_wrapper::
@@ -424,15 +427,15 @@ void megdnn::cuda::cutlass_wrapper::
         int* workspace, const convolution::ConvParam& param,
         uint32_t nonlinear_mode, float alpha, float beta, float gamma,
         float scale, const GemmCoord& threadblock_shape,
-        const GemmCoord& warp_shape, cudaStream_t stream) {
+        const GemmCoord& warp_shape, int stages, cudaStream_t stream) {
 #define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \
                                         threadblock_k_, warp_m_, warp_n_, \
-                                        warp_k_, aligned_) \
+                                        warp_k_, stages_, aligned_) \
     if (threadblock_shape.m() == threadblock_m_ && \
         threadblock_shape.n() == threadblock_n_ && \
         threadblock_shape.k() == threadblock_k_ && \
         warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \
-        warp_shape.k() == warp_k_) { \
+        warp_shape.k() == warp_k_ && stages == stages_) { \
         using ThreadBlockShape = \
                 cutlass::gemm::GemmShape<threadblock_m_, threadblock_n_, \
                                          threadblock_k_>; \
@@ -449,7 +452,7 @@ void megdnn::cuda::cutlass_wrapper::
                     cutlass::convolution::threadblock:: \
                             ConvolutionNCxHWxThreadblockSwizzle< \
                                     cutlass::convolution::ConvType::kConvolution>, \
-                    2, 4, aligned_, NeedLoadFromConstMem, \
+                    stages_, 4, aligned_, NeedLoadFromConstMem, \
                     cutlass::arch::OpMultiplyAdd>; \
         typename Convolution::ConvolutionParameter conv_param{ \
                 param.n, param.ci, param.co, param.hi, param.wi, \
@@ -460,16 +463,17 @@ void megdnn::cuda::cutlass_wrapper::
                 epilogue, stream); \
     }
 #define DISPATCH_KERNEL \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 64, 8, 16, 64, 8, 4); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 128, 16, 16, 128, 16, 1, 8); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 64, 8, 16, 64, 8, 2, 4); \
    megdnn_assert(false, \
                   "unsupported threadblock shape (%dx%dx%d) and warp shape " \
                   "(%dx%dx%d)", \
@@ -525,7 +529,8 @@ void megdnn::cuda::cutlass_wrapper::
                     uint32_t nonlinear_mode, float alpha, float beta, \
                     float gamma, float scale, \
                     const GemmCoord& threadblock_shape, \
-                    const GemmCoord& warp_shape, cudaStream_t stream);
+                    const GemmCoord& warp_shape, int stages, \
+                    cudaStream_t stream);
 INST(true);
 INST(false);
 #undef INST
@@ -542,7 +547,8 @@ void megdnn::cuda::cutlass_wrapper::
         uint32_t /* nonlinear_mode */, float /* alpha */,
         float /* beta */, float /* gamma */, float /* scale */,
         const GemmCoord& /* threadblock_shape */,
-        const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {}
+        const GemmCoord& /* warp_shape */, int /* stages */,
+        cudaStream_t /* stream */) {}
 #else
 template <bool NeedLoadFromConstMem>
 void megdnn::cuda::cutlass_wrapper::
@@ -552,15 +558,15 @@ void megdnn::cuda::cutlass_wrapper::
         int* workspace, const convolution::ConvParam& param,
         uint32_t nonlinear_mode, float alpha, float beta, float gamma,
         float scale, const GemmCoord& threadblock_shape,
-        const GemmCoord& warp_shape, cudaStream_t stream) {
+        const GemmCoord& warp_shape, int stages, cudaStream_t stream) {
 #define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \
                                         threadblock_k_, warp_m_, warp_n_, \
-                                        warp_k_, aligned_) \
+                                        warp_k_, stages_, aligned_) \
     if (threadblock_shape.m() == threadblock_m_ && \
         threadblock_shape.n() == threadblock_n_ && \
         threadblock_shape.k() == threadblock_k_ && \
         warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \
-        warp_shape.k() == warp_k_) { \
+        warp_shape.k() == warp_k_ && stages == stages_) { \
         using ThreadBlockShape = \
                 cutlass::gemm::GemmShape<threadblock_m_, threadblock_n_, \
                                          threadblock_k_>; \
@@ -577,7 +583,7 @@ void megdnn::cuda::cutlass_wrapper::
                     cutlass::convolution::threadblock:: \
                             ConvolutionNCxHWxThreadblockSwizzle< \
                                     cutlass::convolution::ConvType::kConvolution>, \
-                    2, 4, aligned_, NeedLoadFromConstMem>; \
+                    stages_, 4, aligned_, NeedLoadFromConstMem>; \
         typename Convolution::ConvolutionParameter conv_param{ \
                 param.n, param.ci, param.co, param.hi, param.wi, \
                 param.fh, param.fw, param.ho, param.wo, param.sh, \
@@ -587,15 +593,15 @@ void megdnn::cuda::cutlass_wrapper::
                 epilogue, stream); \
     }
 #define DISPATCH_KERNEL \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 16); \
-    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \
+    DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \
     megdnn_assert(false, \
                   "unsupported threadblock shape (%dx%dx%d) and warp shape " \
                   "(%dx%dx%d)", \
@@ -651,7 +657,8 @@ void megdnn::cuda::cutlass_wrapper::
                     uint32_t nonlinear_mode, float alpha, float beta, \
                     float gamma, float scale, \
                     const GemmCoord& threadblock_shape, \
-                    const GemmCoord& warp_shape, cudaStream_t stream);
+                    const GemmCoord& warp_shape, int stages, \
+                    cudaStream_t stream);
 INST(true);
 INST(false);
 #undef INST
@@ -56,7 +56,7 @@ void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4(
         const convolution::ConvParam& param, uint32_t nonlinear_mode,
         float alpha, float beta, float gamma, float scale,
         const GemmCoord& threadblock_shape, const GemmCoord& warp_shape,
-        cudaStream_t stream);
+        int stages, cudaStream_t stream);
 
 template <bool NeedLoadFromConstMem>
 void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw(
@@ -65,7 +65,7 @@ void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw(
         const convolution::ConvParam& param, uint32_t nonlinear_mode,
         float alpha, float beta, float gamma, float scale,
         const GemmCoord& threadblock_shape, const GemmCoord& warp_shape,
-        cudaStream_t stream);
+        int stages, cudaStream_t stream);
 
 template <bool NeedLoadFromConstMem>
 void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32(
@@ -74,7 +74,7 @@ void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32(
         const convolution::ConvParam& param, uint32_t nonlinear_mode,
         float alpha, float beta, float gamma, float scale,
         const GemmCoord& threadblock_shape, const GemmCoord& warp_shape,
-        cudaStream_t stream);
+        int stages, cudaStream_t stream);
 
 }  // namespace cutlass_wrapper
 }  // namespace cuda
@@ -32,8 +32,11 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available(
     if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout),
                                                 param.format))
         return false;
-    if (param.format != Format::NCHW4 && param.format != Format::NCHW4_NCHW &&
-        param.format != Format::NCHW4_NCHW32)
+    if (param.format == Format::NCHW4_NCHW32) {
+        if (m_algo_param.threadblock_m % 32 != 0)
+            return false;
+    } else if (param.format != Format::NCHW4_NCHW &&
+               param.format != Format::NCHW4)
         return false;
     size_t n = args.src_layout->operator[](0),
            ci = args.src_layout->operator[](1) * 4,
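The availability check above replaces the flat format whitelist with a format-specific rule: NCHW4_NCHW32 output additionally requires the threadblock M dimension to be a multiple of 32 (plausibly because the epilogue stores 32-channel-packed tiles), which in particular excludes the new 16x128x16 kernel for that format. A condensed, self-contained restatement of the rule:

#include <cassert>

enum class Format { NCHW4, NCHW4_NCHW, NCHW4_NCHW32, OTHER };

// `threadblock_m` is the algorithm parameter that the real code reads from
// m_algo_param.threadblock_m.
bool format_available(Format f, int threadblock_m) {
    if (f == Format::NCHW4_NCHW32)
        return threadblock_m % 32 == 0;
    return f == Format::NCHW4 || f == Format::NCHW4_NCHW;
}

int main() {
    assert(format_available(Format::NCHW4_NCHW32, 64));
    assert(!format_available(Format::NCHW4_NCHW32, 16));  // new small-M kernel
    assert(!format_available(Format::OTHER, 64));
}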
@@ -187,7 +190,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
                     cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
                                                m_algo_param.warp_n,
                                                m_algo_param.warp_k},
-                    stream);
+                    m_algo_param.stage, stream);
         } else if (param.format == Format::NCHW4_NCHW) {
             cutlass_wrapper::
                     do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw<false>(
@@ -205,7 +208,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
                     cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
                                                m_algo_param.warp_n,
                                                m_algo_param.warp_k},
-                    stream);
+                    m_algo_param.stage, stream);
         } else {
             megdnn_assert(param.format == Format::NCHW4_NCHW32);
             cutlass_wrapper::
@@ -225,7 +228,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
                     cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
                                                m_algo_param.warp_n,
                                                m_algo_param.warp_k},
-                    stream);
+                    m_algo_param.stage, stream);
         }
     } else {
         if (param.format == Format::NCHW4) {
@@ -242,7 +245,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
                     cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
                                                m_algo_param.warp_n,
                                                m_algo_param.warp_k},
-                    stream);
+                    m_algo_param.stage, stream);
         } else if (param.format == Format::NCHW4_NCHW) {
             cutlass_wrapper::
                     do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw<true>(
@@ -260,7 +263,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
                     cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
                                                m_algo_param.warp_n,
                                                m_algo_param.warp_k},
-                    stream);
+                    m_algo_param.stage, stream);
         } else {
             megdnn_assert(param.format == Format::NCHW4_NCHW32);
@@ -281,7 +284,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
                     cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
                                                m_algo_param.warp_n,
                                                m_algo_param.warp_k},
-                    stream);
+                    m_algo_param.stage, stream);
         }
     }
     after_kernel_launch();
@@ -8,6 +8,7 @@
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClam
         int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        1, 4, 8, true>;
+        1, 4, 8, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
@@ -8,6 +8,7 @@
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
         int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        1, 4, 8, true>;
+        1, 4, 8, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
@@ -8,6 +8,7 @@
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
         int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        1, 4, 8, true>;
+        1, 4, 8, true,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
@@ -8,6 +8,7 @@
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClam
         int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        1, 4, 8, false>;
+        1, 4, 8, false,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
@@ -8,6 +8,7 @@
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
         int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        1, 4, 8, false>;
+        1, 4, 8, false,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
@@ -8,6 +8,7 @@
 using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
 using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
 using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
 using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
         int8_t, 4, int32_t, int32_t, float>;
 using Convolution = cutlass::convolution::device::Convolution<
         int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
-        LayoutSrc, int32_t, LayoutSrc, int32_t,
+        LayoutDst, int32_t, LayoutDst, int32_t,
         cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
         ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
         cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
                 cutlass::convolution::ConvType::kConvolution>,
-        1, 4, 8, false>;
+        1, 4, 8, false,
+        cutlass::arch::OpMultiplyAddSaturate>;
 template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
-        const int8_t* d_src,
-        const int8_t* d_filter,
-        const int32_t* d_bias,
-        const int8_t* d_z,
-        int8_t* d_dst,
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
         int* workspace,
         typename Convolution::ConvolutionParameter const& conv_param,
         typename Convolution::EpilogueOutputOp::Params const& epilogue,
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        1, 4, 8, true,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
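Each of these generated files pins one Convolution configuration (layouts, tile shapes, epilogue, stage count) and explicitly instantiates the shared wrapper template from the included .cuinl, so every kernel variant compiles as its own translation unit. A stripped-down sketch of that pattern, with stand-in types rather than the real CUTLASS ones:

#include <cstdio>

// Stand-in for the wrapper template that the shared .cuinl provides.
template <typename Convolution>
void convolution_wrapper(const typename Convolution::ElementSrc* d_src,
                         typename Convolution::ElementDst* d_dst) {
    std::printf("launch the one kernel this TU was compiled for\n");
}

// Stand-in for one generated configuration.
struct ConvConfig {
    using ElementSrc = signed char;  // int8 source
    using ElementDst = float;        // float NCHW destination, as above
};

// The explicit instantiation is the point of each generated file: it forces
// exactly this configuration to be code-generated in this translation unit.
template void convolution_wrapper<ConvConfig>(const ConvConfig::ElementSrc*,
                                              ConvConfig::ElementDst*);

int main() {
    float out = 0.f;
    convolution_wrapper<ConvConfig>(nullptr, &out);
}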
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        1, 4, 8, true,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        1, 4, 8, true,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        1, 4, 8, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        1, 4, 8, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
@@ -0,0 +1,37 @@
+#if !MEGDNN_TEGRA_X1
+// generated by gen_cuda_conv_bias_kern_impls.py
+// ignore warning of cutlass
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wunused-parameter"
+#pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"
+using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
+using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
+using LayoutDst = cutlass::layout::TensorNCHW;
+using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
+using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
+using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu<
+        float, 1, int32_t, float, float>;
+using Convolution = cutlass::convolution::device::Convolution<
+        int8_t, LayoutSrc, int8_t, LayoutFilter, float,
+        LayoutDst, float, LayoutDst, int32_t,
+        cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
+        ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
+        cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
+                cutlass::convolution::ConvType::kConvolution>,
+        1, 4, 8, false,
+        cutlass::arch::OpMultiplyAdd>;
+template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
+        const typename Convolution::ElementSrc* d_src,
+        const typename Convolution::ElementFilter* d_filter,
+        const typename Convolution::ElementBias* d_bias,
+        const typename Convolution::ElementDst* d_z,
+        typename Convolution::ElementDst* d_dst,
+        int* workspace,
+        typename Convolution::ConvolutionParameter const& conv_param,
+        typename Convolution::EpilogueOutputOp::Params const& epilogue,
+        cudaStream_t stream);
+#pragma GCC diagnostic pop
+#endif
@@ -3895,6 +3895,9 @@ TEST(TestGoptInference, FoldingConvDimshuffle) {
                     .apply({{y}})
                     .endpoint_vars(),
             y_fuse);
+    gopt::modify_opr_algo_strategy_inplace(
+            {y_fuse},
+            opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::PROFILE);
     graph->compile({{y_fuse, {}}})
             ->to_json()
             ->writeto_fpath(output_file(
@@ -3976,6 +3979,9 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NCHW32) {
                     .apply({{y}})
                     .endpoint_vars(),
             y_fuse);
+    gopt::modify_opr_algo_strategy_inplace(
+            {y_fuse},
+            opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::PROFILE);
     graph->compile({{y_fuse, {}}})
             ->to_json()
             ->writeto_fpath(output_file(
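Both tests now pin the execution strategy to PROFILE before compiling the fused graph, presumably so that algorithm selection among the enlarged kernel set (including the new single-stage 16x128x16 variants) is decided by on-device profiling rather than heuristics, keeping the chosen algorithm consistent with what the dumped graph is expected to contain.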