Browse Source

fix(mgb/gopt): fix folding conv dimshuffle opt pass

GitOrigin-RevId: 878b7de9de
tags/v1.3.0
Megvii Engine Team 4 years ago
parent
commit
2e4b9a42f7
16 changed files with 339 additions and 89 deletions
  1. +43
    -36
      dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu
  2. +3
    -3
      dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh
  3. +11
    -8
      dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp
  4. +9
    -7
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu
  5. +9
    -7
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu
  6. +9
    -7
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu
  7. +9
    -7
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu
  8. +9
    -7
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu
  9. +9
    -7
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu
  10. +37
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_hswish.cu
  11. +37
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_id.cu
  12. +37
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_relu.cu
  13. +37
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_hswish.cu
  14. +37
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_id.cu
  15. +37
    -0
      dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_relu.cu
  16. +6
    -0
      src/gopt/test/inference.cpp

+ 43
- 36
dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cu View File

@@ -286,7 +286,8 @@ void megdnn::cuda::cutlass_wrapper::
uint32_t /* nonlinear_mode */, float /* alpha */,
float /* beta */, float /* gamma */, float /* scale */,
const GemmCoord& /* threadblock_shape */,
const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {}
const GemmCoord& /* warp_shape */, int /* stages */,
cudaStream_t /* stream */) {}
#else
template <bool NeedLoadFromConstMem>
void megdnn::cuda::cutlass_wrapper::
@@ -296,15 +297,15 @@ void megdnn::cuda::cutlass_wrapper::
int* workspace, const convolution::ConvParam& param,
uint32_t nonlinear_mode, float alpha, float beta, float gamma,
float scale, const GemmCoord& threadblock_shape,
const GemmCoord& warp_shape, cudaStream_t stream) {
const GemmCoord& warp_shape, int stages, cudaStream_t stream) {
#define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \
threadblock_k_, warp_m_, warp_n_, \
warp_k_, stage_, aligned_) \
warp_k_, stage_, aligned_) \
if (threadblock_shape.m() == threadblock_m_ && \
threadblock_shape.n() == threadblock_n_ && \
threadblock_shape.k() == threadblock_k_ && \
warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \
warp_shape.k() == warp_k_) { \
warp_shape.k() == warp_k_ && stages == stage_) { \
using ThreadBlockShape = \
cutlass::gemm::GemmShape<threadblock_m_, threadblock_n_, \
threadblock_k_>; \
@@ -397,7 +398,8 @@ void megdnn::cuda::cutlass_wrapper::
uint32_t nonlinear_mode, float alpha, float beta, \
float gamma, float scale, \
const GemmCoord& threadblock_shape, \
const GemmCoord& warp_shape, cudaStream_t stream);
const GemmCoord& warp_shape, int stages, \
cudaStream_t stream);
INST(true);
INST(false);
#undef INST
@@ -414,7 +416,8 @@ void megdnn::cuda::cutlass_wrapper::
uint32_t /* nonlinear_mode */, float /* alpha */,
float /* beta */, float /* gamma */, float /* scale */,
const GemmCoord& /* threadblock_shape */,
const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {}
const GemmCoord& /* warp_shape */, int /* stages */,
cudaStream_t /* stream */) {}
#else
template <bool NeedLoadFromConstMem>
void megdnn::cuda::cutlass_wrapper::
@@ -424,15 +427,15 @@ void megdnn::cuda::cutlass_wrapper::
int* workspace, const convolution::ConvParam& param,
uint32_t nonlinear_mode, float alpha, float beta, float gamma,
float scale, const GemmCoord& threadblock_shape,
const GemmCoord& warp_shape, cudaStream_t stream) {
const GemmCoord& warp_shape, int stages, cudaStream_t stream) {
#define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \
threadblock_k_, warp_m_, warp_n_, \
warp_k_, aligned_) \
warp_k_, stages_, aligned_) \
if (threadblock_shape.m() == threadblock_m_ && \
threadblock_shape.n() == threadblock_n_ && \
threadblock_shape.k() == threadblock_k_ && \
warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \
warp_shape.k() == warp_k_) { \
warp_shape.k() == warp_k_ && stages == stages_) { \
using ThreadBlockShape = \
cutlass::gemm::GemmShape<threadblock_m_, threadblock_n_, \
threadblock_k_>; \
@@ -449,7 +452,7 @@ void megdnn::cuda::cutlass_wrapper::
cutlass::convolution::threadblock:: \
ConvolutionNCxHWxThreadblockSwizzle< \
cutlass::convolution::ConvType::kConvolution>, \
2, 4, aligned_, NeedLoadFromConstMem, \
stages_, 4, aligned_, NeedLoadFromConstMem, \
cutlass::arch::OpMultiplyAdd>; \
typename Convolution::ConvolutionParameter conv_param{ \
param.n, param.ci, param.co, param.hi, param.wi, \
@@ -460,16 +463,17 @@ void megdnn::cuda::cutlass_wrapper::
epilogue, stream); \
}
#define DISPATCH_KERNEL \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 64, 8, 16, 64, 8, 4); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 128, 16, 16, 128, 16, 1, 8); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(16, 64, 8, 16, 64, 8, 2, 4); \
megdnn_assert(false, \
"unsupported threadblock shape (%dx%dx%d) and warp shape " \
"(%dx%dx%d)", \
@@ -525,7 +529,8 @@ void megdnn::cuda::cutlass_wrapper::
uint32_t nonlinear_mode, float alpha, float beta, \
float gamma, float scale, \
const GemmCoord& threadblock_shape, \
const GemmCoord& warp_shape, cudaStream_t stream);
const GemmCoord& warp_shape, int stages, \
cudaStream_t stream);
INST(true);
INST(false);
#undef INST
@@ -542,7 +547,8 @@ void megdnn::cuda::cutlass_wrapper::
uint32_t /* nonlinear_mode */, float /* alpha */,
float /* beta */, float /* gamma */, float /* scale */,
const GemmCoord& /* threadblock_shape */,
const GemmCoord& /* warp_shape */, cudaStream_t /* stream */) {}
const GemmCoord& /* warp_shape */, int /* stages */,
cudaStream_t /* stream */) {}
#else
template <bool NeedLoadFromConstMem>
void megdnn::cuda::cutlass_wrapper::
@@ -552,15 +558,15 @@ void megdnn::cuda::cutlass_wrapper::
int* workspace, const convolution::ConvParam& param,
uint32_t nonlinear_mode, float alpha, float beta, float gamma,
float scale, const GemmCoord& threadblock_shape,
const GemmCoord& warp_shape, cudaStream_t stream) {
const GemmCoord& warp_shape, int stages, cudaStream_t stream) {
#define DISPATCH_KERNEL_WITH_TILE_SHAPE(threadblock_m_, threadblock_n_, \
threadblock_k_, warp_m_, warp_n_, \
warp_k_, aligned_) \
warp_k_, stages_, aligned_) \
if (threadblock_shape.m() == threadblock_m_ && \
threadblock_shape.n() == threadblock_n_ && \
threadblock_shape.k() == threadblock_k_ && \
warp_shape.m() == warp_m_ && warp_shape.n() == warp_n_ && \
warp_shape.k() == warp_k_) { \
warp_shape.k() == warp_k_ && stages == stages_) { \
using ThreadBlockShape = \
cutlass::gemm::GemmShape<threadblock_m_, threadblock_n_, \
threadblock_k_>; \
@@ -577,7 +583,7 @@ void megdnn::cuda::cutlass_wrapper::
cutlass::convolution::threadblock:: \
ConvolutionNCxHWxThreadblockSwizzle< \
cutlass::convolution::ConvType::kConvolution>, \
2, 4, aligned_, NeedLoadFromConstMem>; \
stages_, 4, aligned_, NeedLoadFromConstMem>; \
typename Convolution::ConvolutionParameter conv_param{ \
param.n, param.ci, param.co, param.hi, param.wi, \
param.fh, param.fw, param.ho, param.wo, param.sh, \
@@ -587,15 +593,15 @@ void megdnn::cuda::cutlass_wrapper::
epilogue, stream); \
}
#define DISPATCH_KERNEL \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 128, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 64, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 128, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(128, 32, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 128, 32, 32, 64, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 64, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 64, 32, 32, 64, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(64, 32, 32, 64, 32, 32, 2, 16); \
DISPATCH_KERNEL_WITH_TILE_SHAPE(32, 32, 32, 32, 32, 32, 2, 16); \
megdnn_assert(false, \
"unsupported threadblock shape (%dx%dx%d) and warp shape " \
"(%dx%dx%d)", \
@@ -651,7 +657,8 @@ void megdnn::cuda::cutlass_wrapper::
uint32_t nonlinear_mode, float alpha, float beta, \
float gamma, float scale, \
const GemmCoord& threadblock_shape, \
const GemmCoord& warp_shape, cudaStream_t stream);
const GemmCoord& warp_shape, int stages, \
cudaStream_t stream);
INST(true);
INST(false);
#undef INST


+ 3
- 3
dnn/src/cuda/conv_bias/cutlass_convolution_wrapper.cuh View File

@@ -56,7 +56,7 @@ void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4(
const convolution::ConvParam& param, uint32_t nonlinear_mode,
float alpha, float beta, float gamma, float scale,
const GemmCoord& threadblock_shape, const GemmCoord& warp_shape,
cudaStream_t stream);
int stages, cudaStream_t stream);

template <bool NeedLoadFromConstMem>
void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw(
@@ -65,7 +65,7 @@ void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw(
const convolution::ConvParam& param, uint32_t nonlinear_mode,
float alpha, float beta, float gamma, float scale,
const GemmCoord& threadblock_shape, const GemmCoord& warp_shape,
cudaStream_t stream);
int stages, cudaStream_t stream);

template <bool NeedLoadFromConstMem>
void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32(
@@ -74,7 +74,7 @@ void do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_ncdiv32hw32(
const convolution::ConvParam& param, uint32_t nonlinear_mode,
float alpha, float beta, float gamma, float scale,
const GemmCoord& threadblock_shape, const GemmCoord& warp_shape,
cudaStream_t stream);
int stages, cudaStream_t stream);

} // namespace cutlass_wrapper
} // namespace cuda


+ 11
- 8
dnn/src/cuda/conv_bias/implicit_gemm_int8_nchw4_dp4a.cpp View File

@@ -32,8 +32,11 @@ bool ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::is_available(
if (!conv_bias::check_bias_share_in_channel(*(args.bias_layout),
param.format))
return false;
if (param.format != Format::NCHW4 && param.format != Format::NCHW4_NCHW &&
param.format != Format::NCHW4_NCHW32)
if (param.format == Format::NCHW4_NCHW32) {
if (m_algo_param.threadblock_m % 32 != 0)
return false;
} else if (param.format != Format::NCHW4_NCHW &&
param.format != Format::NCHW4)
return false;
size_t n = args.src_layout->operator[](0),
ci = args.src_layout->operator[](1) * 4,
@@ -187,7 +190,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
m_algo_param.warp_n,
m_algo_param.warp_k},
stream);
m_algo_param.stage, stream);
} else if (param.format == Format::NCHW4_NCHW) {
cutlass_wrapper::
do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw<false>(
@@ -205,7 +208,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
m_algo_param.warp_n,
m_algo_param.warp_k},
stream);
m_algo_param.stage, stream);
} else {
megdnn_assert(param.format == Format::NCHW4_NCHW32);
cutlass_wrapper::
@@ -225,7 +228,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
m_algo_param.warp_n,
m_algo_param.warp_k},
stream);
m_algo_param.stage, stream);
}
} else {
if (param.format == Format::NCHW4) {
@@ -242,7 +245,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
m_algo_param.warp_n,
m_algo_param.warp_k},
stream);
m_algo_param.stage, stream);
} else if (param.format == Format::NCHW4_NCHW) {
cutlass_wrapper::
do_conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw<true>(
@@ -260,7 +263,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
m_algo_param.warp_n,
m_algo_param.warp_k},
stream);
m_algo_param.stage, stream);

} else {
megdnn_assert(param.format == Format::NCHW4_NCHW32);
@@ -281,7 +284,7 @@ void ConvBiasForwardImpl::AlgoInt8NCHW4DotProdImplicitGemm::exec(
cutlass_wrapper::GemmCoord{m_algo_param.warp_m,
m_algo_param.warp_n,
m_algo_param.warp_k},
stream);
m_algo_param.stage, stream);
}
}
after_kernel_launch();


+ 9
- 7
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_hswish.cu View File

@@ -8,6 +8,7 @@

using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClam
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutSrc, int32_t, LayoutSrc, int32_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
1, 4, 8, true>;
1, 4, 8, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const int8_t* d_src,
const int8_t* d_filter,
const int32_t* d_bias,
const int8_t* d_z,
int8_t* d_dst,
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,


+ 9
- 7
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_id.cu View File

@@ -8,6 +8,7 @@

using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutSrc, int32_t, LayoutSrc, int32_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
1, 4, 8, true>;
1, 4, 8, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const int8_t* d_src,
const int8_t* d_filter,
const int32_t* d_bias,
const int8_t* d_z,
int8_t* d_dst,
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,


+ 9
- 7
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_16x128x16_16x128x16_relu.cu View File

@@ -8,6 +8,7 @@

using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutSrc, int32_t, LayoutSrc, int32_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
1, 4, 8, true>;
1, 4, 8, true,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const int8_t* d_src,
const int8_t* d_filter,
const int32_t* d_bias,
const int8_t* d_z,
int8_t* d_dst,
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,


+ 9
- 7
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_hswish.cu View File

@@ -8,6 +8,7 @@

using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwishClam
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutSrc, int32_t, LayoutSrc, int32_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
1, 4, 8, false>;
1, 4, 8, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const int8_t* d_src,
const int8_t* d_filter,
const int32_t* d_bias,
const int8_t* d_z,
int8_t* d_dst,
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,


+ 9
- 7
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_id.cu View File

@@ -8,6 +8,7 @@

using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutSrc, int32_t, LayoutSrc, int32_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
1, 4, 8, false>;
1, 4, 8, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const int8_t* d_src,
const int8_t* d_filter,
const int32_t* d_bias,
const int8_t* d_z,
int8_t* d_dst,
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,


+ 9
- 7
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_1x1_16x128x16_16x128x16_relu.cu View File

@@ -8,6 +8,7 @@

using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCxHWx<4>;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
@@ -15,18 +16,19 @@ using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationReluClamp<
int8_t, 4, int32_t, int32_t, float>;
using Convolution = cutlass::convolution::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, int8_t,
LayoutSrc, int32_t, LayoutSrc, int32_t,
LayoutDst, int32_t, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
1, 4, 8, false>;
1, 4, 8, false,
cutlass::arch::OpMultiplyAddSaturate>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const int8_t* d_src,
const int8_t* d_filter,
const int32_t* d_bias,
const int8_t* d_z,
int8_t* d_dst,
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,


+ 37
- 0
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_hswish.cu View File

@@ -0,0 +1,37 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCHW;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
float, 1, int32_t, float, float>;
using Convolution = cutlass::convolution::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, float,
LayoutDst, float, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
1, 4, 8, true,
cutlass::arch::OpMultiplyAdd>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
#pragma GCC diagnostic pop
#endif

+ 37
- 0
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_id.cu View File

@@ -0,0 +1,37 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCHW;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination<
float, 1, int32_t, float, float>;
using Convolution = cutlass::convolution::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, float,
LayoutDst, float, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
1, 4, 8, true,
cutlass::arch::OpMultiplyAdd>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
#pragma GCC diagnostic pop
#endif

+ 37
- 0
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_16x128x16_16x128x16_relu.cu View File

@@ -0,0 +1,37 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCHW;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu<
float, 1, int32_t, float, float>;
using Convolution = cutlass::convolution::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, float,
LayoutDst, float, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
1, 4, 8, true,
cutlass::arch::OpMultiplyAdd>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
#pragma GCC diagnostic pop
#endif

+ 37
- 0
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_hswish.cu View File

@@ -0,0 +1,37 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCHW;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationHSwish<
float, 1, int32_t, float, float>;
using Convolution = cutlass::convolution::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, float,
LayoutDst, float, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
1, 4, 8, false,
cutlass::arch::OpMultiplyAdd>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
#pragma GCC diagnostic pop
#endif

+ 37
- 0
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_id.cu View File

@@ -0,0 +1,37 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCHW;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombination<
float, 1, int32_t, float, float>;
using Convolution = cutlass::convolution::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, float,
LayoutDst, float, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
1, 4, 8, false,
cutlass::arch::OpMultiplyAdd>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
#pragma GCC diagnostic pop
#endif

+ 37
- 0
dnn/src/cuda/conv_bias/int8/kimpl/conv_bias_int8_implicit_gemm_dp4a_ncdiv4hw4_nchw_1x1_16x128x16_16x128x16_relu.cu View File

@@ -0,0 +1,37 @@
#if !MEGDNN_TEGRA_X1
// generated by gen_cuda_conv_bias_kern_impls.py
// ignore warning of cutlass
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-parameter"
#pragma GCC diagnostic ignored "-Wstrict-aliasing"
#include "src/cuda/conv_bias/int8/conv_bias_int8_implicit_gemm_cutlass_wrapper.cuinl"

using LayoutSrc = cutlass::layout::TensorNCxHWx<4>;
using LayoutFilter = cutlass::layout::TensorCxRSKx<4>;
using LayoutDst = cutlass::layout::TensorNCHW;
using ThreadBlockShape = cutlass::gemm::GemmShape<16, 128, 16>;
using WarpShape = cutlass::gemm::GemmShape<16, 128, 16>;
using InstructionShape = cutlass::gemm::GemmShape<1, 1, 4>;
using EpilogueOp = cutlass::epilogue::thread::BiasAddLinearCombinationRelu<
float, 1, int32_t, float, float>;
using Convolution = cutlass::convolution::device::Convolution<
int8_t, LayoutSrc, int8_t, LayoutFilter, float,
LayoutDst, float, LayoutDst, int32_t,
cutlass::convolution::ConvType::kConvolution, cutlass::arch::OpClassSimt, cutlass::arch::Sm61,
ThreadBlockShape, WarpShape, InstructionShape, EpilogueOp,
cutlass::convolution::threadblock::ConvolutionNCxHWxThreadblockSwizzle<
cutlass::convolution::ConvType::kConvolution>,
1, 4, 8, false,
cutlass::arch::OpMultiplyAdd>;
template void megdnn::cuda::cutlass_wrapper::cutlass_convolution_wrapper<Convolution>(
const typename Convolution::ElementSrc* d_src,
const typename Convolution::ElementFilter* d_filter,
const typename Convolution::ElementBias* d_bias,
const typename Convolution::ElementDst* d_z,
typename Convolution::ElementDst* d_dst,
int* workspace,
typename Convolution::ConvolutionParameter const& conv_param,
typename Convolution::EpilogueOutputOp::Params const& epilogue,
cudaStream_t stream);
#pragma GCC diagnostic pop
#endif

+ 6
- 0
src/gopt/test/inference.cpp View File

@@ -3895,6 +3895,9 @@ TEST(TestGoptInference, FoldingConvDimshuffle) {
.apply({{y}})
.endpoint_vars(),
y_fuse);
gopt::modify_opr_algo_strategy_inplace(
{y_fuse},
opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::PROFILE);
graph->compile({{y_fuse, {}}})
->to_json()
->writeto_fpath(output_file(
@@ -3976,6 +3979,9 @@ TEST(TestGoptInference, FoldingConvDimshuffleNCHW4NCHW32) {
.apply({{y}})
.endpoint_vars(),
y_fuse);
gopt::modify_opr_algo_strategy_inplace(
{y_fuse},
opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy::PROFILE);
graph->compile({{y_fuse, {}}})
->to_json()
->writeto_fpath(output_file(


Loading…
Cancel
Save