From fa4883389a4e44c3d10ce077db6785112c8d46aa Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Wed, 26 Oct 2022 17:30:02 +0800 Subject: [PATCH] feat(dnn,imperative): remove the restriction of tensor shape when using uint8 region mask GitOrigin-RevId: 37d99006978d756d22111347be2ab2d895f71661 --- .../chanwise/depthwise_large_filter_algo.cuh | 49 +-- .../region_restricted_convolution/opr_impl.cpp | 11 +- dnn/test/cuda/region_restricted_convolution.cpp | 336 ++++++++++++++------- .../python/test/unit/functional/test_functional.py | 4 +- 4 files changed, 258 insertions(+), 142 deletions(-) diff --git a/dnn/src/cuda/region_restricted_convolution/chanwise/depthwise_large_filter_algo.cuh b/dnn/src/cuda/region_restricted_convolution/chanwise/depthwise_large_filter_algo.cuh index c35df2b1..04157196 100644 --- a/dnn/src/cuda/region_restricted_convolution/chanwise/depthwise_large_filter_algo.cuh +++ b/dnn/src/cuda/region_restricted_convolution/chanwise/depthwise_large_filter_algo.cuh @@ -784,20 +784,24 @@ __global__ void DepthwiseConv2dGPUKernelNCHW( static_assert((OutTileConfig::unroll_w & 3) == 0, "output tile unroll_w & 3 != 0"); static_assert((OutTileConfig::block_w & 3) == 0, "output block_w & 3 != 0"); int reg_rout[OutTileConfig::unroll_size] = {0}; + int relative_offset = sizeof(dt_int32) / sizeof(dt_uint8); #pragma unroll for (int i = 0; i < OutTileConfig::unroll_h; ++i) { int out_h_idx = out_base_h_idx + i; if (out_h_idx < param.out_h) { #pragma unroll - for (int j = 0; j < OutTileConfig::unroll_w; j += 4) { + for (int j = 0; j < OutTileConfig::unroll_w; j += relative_offset) { int out_w_idx = out_start_w + j; if (out_w_idx < param.out_w) { - uint32_t val = *(reinterpret_cast( - &rout_base_ptr[out_h_idx * param.out_w + out_w_idx])); - reg_rout[i * OutTileConfig::unroll_w + j] = val & 0xff; - reg_rout[i * OutTileConfig::unroll_w + j + 1] = (val >> 8) & 0xff; - reg_rout[i * OutTileConfig::unroll_w + j + 2] = (val >> 16) & 0xff; - reg_rout[i * OutTileConfig::unroll_w + j + 3] = (val >> 24) & 0xff; + int valid_offset = relative_offset + out_w_idx > param.out_w + ? param.out_w - out_w_idx + : relative_offset; +#pragma unroll + for (int t = 0; t < valid_offset; t += 1) { + uint8_t val = + rout_base_ptr[out_h_idx * param.out_w + out_w_idx + t]; + reg_rout[i * OutTileConfig::unroll_w + j + t] = val & 0xff; + } } } } @@ -855,21 +859,23 @@ __global__ void DepthwiseConv2dGPUKernelNCHW( int s_idx = (off_oh * stride_h + s_h) % SrcTileCount::smem_h * SrcTileCount::smem_w + (off_oh * stride_h + s_h) / SrcTileCount::bank_offset_line; + int r_idx = (off_oh * stride_h + s_h) % RinTileCount::smem_h * + RinTileCount::smem_w + + (off_oh * stride_h + s_h) / RinTileCount::bank_offset_line; #pragma unroll - for (int s_w = 0; s_w < irin_unroll_w; s_w += 4) { - uint32_t val = smem_rin_ptr - [(off_oh * stride_h + s_h) % RinTileCount::smem_h * - RinTileCount::smem_w + - (s_w >> 2) + - (off_oh * stride_h + s_h) / RinTileCount::bank_offset_line]; - reg_src[0][s_h * irin_unroll_w + s_w] = smem_src_ptr[s_idx + s_w]; - reg_src[0][s_h * irin_unroll_w + s_w + 1] = smem_src_ptr[s_idx + s_w + 1]; - reg_src[0][s_h * irin_unroll_w + s_w + 2] = smem_src_ptr[s_idx + s_w + 2]; - reg_src[0][s_h * irin_unroll_w + s_w + 3] = smem_src_ptr[s_idx + s_w + 3]; - reg_rin[0][s_h * irin_unroll_w + s_w] = val & 0xff; - reg_rin[0][s_h * irin_unroll_w + s_w + 1] = (val >> 8) & 0xff; - reg_rin[0][s_h * irin_unroll_w + s_w + 2] = (val >> 16) & 0xff; - reg_rin[0][s_h * irin_unroll_w + s_w + 3] = (val >> 24) & 0xff; + for (int s_w = 0; s_w < SrcTileConfig::unroll_w; ++s_w) { + reg_src[0][s_h * SrcTileConfig::unroll_w + s_w] = smem_src_ptr[s_idx + s_w]; + } +#pragma unroll + for (int s_w = 0; s_w < irin_unroll_w; s_w += relative_offset) { + reg_rin[0][s_h * irin_unroll_w + s_w] = + (smem_rin_ptr[r_idx + (s_w >> 2)]) & 0xff; + reg_rin[0][s_h * irin_unroll_w + s_w + 1] = + (smem_rin_ptr[r_idx + (s_w >> 2)] >> 8) & 0xff; + reg_rin[0][s_h * irin_unroll_w + s_w + 2] = + (smem_rin_ptr[r_idx + (s_w >> 2)] >> 16) & 0xff; + reg_rin[0][s_h * irin_unroll_w + s_w + 3] = + (smem_rin_ptr[r_idx + (s_w >> 2)] >> 24) & 0xff; } } @@ -1108,6 +1114,7 @@ void LaunchDepthwiseConv2dGPU( if (param.is_compute_deafult) { kernel = DepthwiseConv2dGPUKernelNCHW; } else { + printf("expected dnn param compute default mode\n"); megdnn_assert_internal(0); } if (is_fwd) { diff --git a/dnn/src/cuda/region_restricted_convolution/opr_impl.cpp b/dnn/src/cuda/region_restricted_convolution/opr_impl.cpp index ae06f3aa..aabffde9 100644 --- a/dnn/src/cuda/region_restricted_convolution/opr_impl.cpp +++ b/dnn/src/cuda/region_restricted_convolution/opr_impl.cpp @@ -25,9 +25,6 @@ void RegionRestrictedConvolutionForwardImpl::exec( fm.spatial_ndim == 2 && fm.icpg == 1 && fm.ocpg == 1 && fm.dilation[0] == 1 && fm.dilation[1] == 1 && !fm.should_flip && param().stride_h == 1 && param().stride_w == 1); - if (rin.layout.dtype == dtype::Uint8()) { - megdnn_assert((src.layout.shape[3] & 3) == 0 && (dst.layout.shape[3] & 3) == 0); - } auto stream = cuda_stream(handle()); @@ -43,6 +40,7 @@ void RegionRestrictedConvolutionForwardImpl::exec( dst.ptr(), src.ptr(), filter.ptr(), rin.ptr(), rout.ptr(), kparam, stream); } else { + printf("unexpected region restricted conv mode\n"); megdnn_assert_internal(0); } } @@ -81,11 +79,6 @@ void RegionRestrictedConvolutionBackwardDataImpl::exec( fm.spatial_ndim == 2 && fm.icpg == 1 && fm.ocpg == 1 && fm.dilation[0] == 1 && fm.dilation[1] == 1 && !fm.should_flip && param().stride_h == 1 && param().stride_w == 1); - // NOTE: uint8 dtype region mask requires the spatial size of src&dst is 4*N - if (rin.layout.dtype == dtype::Uint8()) { - megdnn_assert( - (grad.layout.shape[3] & 3) == 0 && (diff.layout.shape[3] & 3) == 0); - } auto stream = cuda_stream(handle()); if (filter.layout.dtype == dtype::Float32() && rin.layout.dtype == dtype::Int32() && rout.layout.dtype == dtype::Int32()) { @@ -135,8 +128,6 @@ void RegionRestrictedConvolutionBackwardFilterImpl::exec( int ph = fm.padding[0], pw = fm.padding[1]; int dh = 0, dw = 0; - // check if channelwise convolution - megdnn_assert(fm.icpg == 1 && fm.ocpg == 1); auto stream = cuda_stream(handle()); float alpha = 1.f; diff --git a/dnn/test/cuda/region_restricted_convolution.cpp b/dnn/test/cuda/region_restricted_convolution.cpp index 4a09e048..db10790c 100644 --- a/dnn/test/cuda/region_restricted_convolution.cpp +++ b/dnn/test/cuda/region_restricted_convolution.cpp @@ -1,6 +1,7 @@ #include "megdnn/dtype.h" #include "megdnn/opr_param_defs.h" #include "megdnn/oprs.h" +#include "megdnn/oprs/nn.h" #include "test/common/checker.h" #include "test/common/conv_bias.h" #include "test/common/rng.h" @@ -11,6 +12,7 @@ #include "test/cuda/utils.h" #include +#include #define V1(x) #x #define V(x) V1(x) @@ -37,9 +39,6 @@ TEST_F(CUDA, REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER) { UniformIntRNG r_rng{0, 2}; checker.set_rng(0, &rng).set_rng(1, &rng).set_rng(2, &r_rng).set_rng( 3, &r_rng); - if (dt.enumv() == DTypeEnum::Float16) { - checker.set_epsilon(1e-1); - } cur_param.pad_h = cur_param.pad_w = padding; cur_param.stride_h = cur_param.stride_w = stride; @@ -49,11 +48,32 @@ TEST_F(CUDA, REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER) { checker.set_param(cur_param).execs( {{n, g, h, h}, {g, 1, 1, fh, fh}, {n, h, h}, {n, ho, ho}, {}}); }; + run(1, 1, 3, 2, 1, 1); + run(1, 1, 5, 2, 1, 1); + run(1, 1, 6, 2, 1, 1); + run(1, 1, 7, 2, 1, 1); + run(1, 1, 9, 2, 1, 1); + run(1, 1, 10, 2, 1, 1); + run(1, 1, 11, 2, 1, 1); + run(1, 1, 13, 2, 1, 1); + run(1, 1, 14, 2, 1, 1); + run(1, 1, 15, 2, 1, 1); + run(1, 1, 17, 2, 1, 1); + run(1, 1, 18, 2, 1, 1); + run(1, 1, 19, 2, 1, 1); + run(1, 1, 21, 2, 1, 1); + run(1, 1, 22, 2, 1, 1); + run(1, 1, 23, 2, 1, 1); + run(1, 1, 25, 2, 1, 1); + run(1, 1, 26, 2, 1, 1); + run(1, 1, 27, 2, 1, 1); + run(1, 1, 29, 2, 1, 1); + run(1, 1, 30, 2, 1, 1); + run(1, 1, 31, 2, 1, 1); run(4, 8, 32, 3, 3 / 2, 1); run(4, 8, 32, 5, 5 / 2, 1); run(4, 8, 32, 7, 7 / 2, 1); - run(1, 2, 32, 9, 9 / 2, 1); - run(4, 1, 32, 9, 9 / 2, 1); + run(4, 8, 32, 9, 9 / 2, 1); run(4, 8, 32, 11, 11 / 2, 1); run(4, 8, 32, 13, 13 / 2, 1); run(4, 8, 32, 15, 15 / 2, 1); @@ -65,12 +85,27 @@ TEST_F(CUDA, REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER) { run(4, 8, 32, 27, 27 / 2, 1); run(4, 8, 32, 29, 29 / 2, 1); run(4, 8, 32, 31, 31 / 2, 1); + run(4, 8, 31, 3, 3 / 2, 1); + run(4, 8, 31, 5, 5 / 2, 1); + run(4, 8, 31, 7, 7 / 2, 1); + run(4, 8, 31, 9, 9 / 2, 1); + run(4, 8, 31, 11, 11 / 2, 1); + run(4, 8, 31, 13, 13 / 2, 1); + run(4, 8, 31, 15, 15 / 2, 1); + run(4, 8, 31, 17, 17 / 2, 1); + run(4, 8, 31, 19, 19 / 2, 1); + run(4, 8, 31, 21, 21 / 2, 1); + run(4, 8, 31, 23, 23 / 2, 1); + run(4, 8, 31, 25, 25 / 2, 1); + run(4, 8, 31, 27, 27 / 2, 1); + run(4, 8, 31, 29, 29 / 2, 1); + run(4, 8, 31, 31, 31 / 2, 1); } } #if MEGDNN_WITH_BENCHMARK -TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_FP32) { +TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_FP32_INT32) { require_compute_capability(7, 5); Benchmarker bencher(handle_cuda()); bencher.set_display(false); @@ -153,24 +188,24 @@ TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_FP32) { time_in_ms / rr_time_in_ms); }; - run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10); - run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10); - run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10); - run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10); - run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10); - run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10); - run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10); - run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10); - run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10); - run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10); - run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10); - run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10); - run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10); - run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10); - run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10); + run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000); + run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000); + run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000); + run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000); + run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000); + run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000); + run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000); + run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000); + run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000); + run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000); + run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000); + run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000); + run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000); + run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000); + run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000); } -TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_LARGE_FILTER_FP32) { +TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_DATA_FP32_INT32) { require_compute_capability(7, 5); Benchmarker bencher(handle_cuda()); bencher.set_display(false); @@ -251,24 +286,24 @@ TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_LARGE_FILTER_FP32) { time_in_ms / rr_time_in_ms); }; - run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10); - run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10); - run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10); - run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10); - run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10); - run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10); - run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10); - run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10); - run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10); - run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10); - run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10); - run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10); - run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10); - run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10); - run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10); + run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000); + run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000); + run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000); + run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000); + run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000); + run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000); + run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000); + run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000); + run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000); + run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000); + run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000); + run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000); + run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000); + run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000); + run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000); } -TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_LARGE_FILTER_FP32_UINT8) { +TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_DATA_FP32_UINT8) { require_compute_capability(7, 5); Benchmarker bencher(handle_cuda()); bencher.set_display(false); @@ -349,21 +384,36 @@ TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_LARGE_FILTER_FP32_UINT8) time_in_ms / rr_time_in_ms); }; - run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10); - run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10); - run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10); - run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10); - run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10); - run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10); - run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10); - run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10); - run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10); - run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10); - run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10); - run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10); - run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10); - run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10); - run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10); + run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000); + run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000); + run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000); + run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000); + run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000); + run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000); + run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000); + run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000); + run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000); + run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000); + run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000); + run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000); + run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000); + run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000); + run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000); + run_bench(64, 384, 31, 31, 3, 3, 1, 1, 1000); + run_bench(64, 384, 31, 31, 5, 5, 1, 1, 1000); + run_bench(64, 384, 31, 31, 7, 7, 1, 1, 1000); + run_bench(64, 384, 31, 31, 9, 9, 1, 1, 1000); + run_bench(64, 384, 31, 31, 11, 11, 1, 1, 1000); + run_bench(64, 384, 31, 31, 13, 13, 1, 1, 1000); + run_bench(64, 384, 31, 31, 15, 15, 1, 1, 1000); + run_bench(64, 384, 31, 31, 17, 17, 1, 1, 1000); + run_bench(64, 384, 31, 31, 19, 19, 1, 1, 1000); + run_bench(64, 384, 31, 31, 21, 21, 1, 1, 1000); + run_bench(64, 384, 31, 31, 23, 23, 1, 1, 1000); + run_bench(64, 384, 31, 31, 25, 25, 1, 1, 1000); + run_bench(64, 384, 31, 31, 27, 27, 1, 1, 1000); + run_bench(64, 384, 31, 31, 29, 29, 1, 1, 1000); + run_bench(64, 384, 31, 31, 31, 31, 1, 1, 1000); } TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_UINT8) { @@ -449,21 +499,36 @@ TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_FORWARD_LARGE_FILTER_UINT8) { time_in_ms / rr_time_in_ms); }; - run_bench(64, 384, 32, 32, 3, 3, 1, 1, 10); - run_bench(64, 384, 32, 32, 5, 5, 1, 1, 10); - run_bench(64, 384, 32, 32, 7, 7, 1, 1, 10); - run_bench(64, 384, 32, 32, 9, 9, 1, 1, 10); - run_bench(64, 384, 32, 32, 11, 11, 1, 1, 10); - run_bench(64, 384, 32, 32, 13, 13, 1, 1, 10); - run_bench(64, 384, 32, 32, 15, 15, 1, 1, 10); - run_bench(64, 384, 32, 32, 17, 17, 1, 1, 10); - run_bench(64, 384, 32, 32, 19, 19, 1, 1, 10); - run_bench(64, 384, 32, 32, 21, 21, 1, 1, 10); - run_bench(64, 384, 32, 32, 23, 23, 1, 1, 10); - run_bench(64, 384, 32, 32, 25, 25, 1, 1, 10); - run_bench(64, 384, 32, 32, 27, 27, 1, 1, 10); - run_bench(64, 384, 32, 32, 29, 29, 1, 1, 10); - run_bench(64, 384, 32, 32, 31, 31, 1, 1, 10); + run_bench(64, 384, 32, 32, 3, 3, 1, 1, 1000); + run_bench(64, 384, 32, 32, 5, 5, 1, 1, 1000); + run_bench(64, 384, 32, 32, 7, 7, 1, 1, 1000); + run_bench(64, 384, 32, 32, 9, 9, 1, 1, 1000); + run_bench(64, 384, 32, 32, 11, 11, 1, 1, 1000); + run_bench(64, 384, 32, 32, 13, 13, 1, 1, 1000); + run_bench(64, 384, 32, 32, 15, 15, 1, 1, 1000); + run_bench(64, 384, 32, 32, 17, 17, 1, 1, 1000); + run_bench(64, 384, 32, 32, 19, 19, 1, 1, 1000); + run_bench(64, 384, 32, 32, 21, 21, 1, 1, 1000); + run_bench(64, 384, 32, 32, 23, 23, 1, 1, 1000); + run_bench(64, 384, 32, 32, 25, 25, 1, 1, 1000); + run_bench(64, 384, 32, 32, 27, 27, 1, 1, 1000); + run_bench(64, 384, 32, 32, 29, 29, 1, 1, 1000); + run_bench(64, 384, 32, 32, 31, 31, 1, 1, 1000); + run_bench(64, 384, 31, 31, 3, 3, 1, 1, 1000); + run_bench(64, 384, 31, 31, 5, 5, 1, 1, 1000); + run_bench(64, 384, 31, 31, 7, 7, 1, 1, 1000); + run_bench(64, 384, 31, 31, 9, 9, 1, 1, 1000); + run_bench(64, 384, 31, 31, 11, 11, 1, 1, 1000); + run_bench(64, 384, 31, 31, 13, 13, 1, 1, 1000); + run_bench(64, 384, 31, 31, 15, 15, 1, 1, 1000); + run_bench(64, 384, 31, 31, 17, 17, 1, 1, 1000); + run_bench(64, 384, 31, 31, 19, 19, 1, 1, 1000); + run_bench(64, 384, 31, 31, 21, 21, 1, 1, 1000); + run_bench(64, 384, 31, 31, 23, 23, 1, 1, 1000); + run_bench(64, 384, 31, 31, 25, 25, 1, 1, 1000); + run_bench(64, 384, 31, 31, 27, 27, 1, 1, 1000); + run_bench(64, 384, 31, 31, 29, 29, 1, 1, 1000); + run_bench(64, 384, 31, 31, 31, 31, 1, 1, 1000); } TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_FILTER_FP32) { @@ -538,7 +603,7 @@ TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_FILTER_FP32) { auto rr_time_in_ms = rr_bencher.execs({src, diff, rin, rout, grad}) / nr_times; auto rr_ops = 2.0 * batch * g * hi * wi * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12; - printf("[DGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: " + printf("[WGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: " "src=%s, " "diff=%s, grad=%s\n" "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n" @@ -638,7 +703,7 @@ TEST_F(CUDA, BENCHMARK_REGION_RESTRICTED_CONV_BACKWARD_FILTER_FP32_RINT8) { auto rr_time_in_ms = rr_bencher.execs({src, diff, rin, rout, grad}) / nr_times; auto rr_ops = 2.0 * batch * g * hi * wi * fh * fw / (rr_time_in_ms * 1e-3) * 1e-12; - printf("[DGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: " + printf("[WGRAD]RegionRestrictedDepthwiseLargeFilter vs DepthwiseLargeFilter: " "src=%s, " "diff=%s, grad=%s\n" "time: %.2f ms, time(rr): %.2f ms, perf: %.2fTops, perf(rr): %.2f Tops\n" @@ -703,15 +768,30 @@ TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32) { {n, g * 1, ih, ih} // grad }); }; - if (dt == dtype::Int32()) { - run(4, 8, 32, 5, 5 / 2, 1); - run(1, 2, 2, 2, 0, 1); - run(1, 2, 3, 3, 0, 1); - run(1, 2, 4, 4, 0, 1); - run(1, 2, 5, 5, 0, 1); - run(1, 2, 6, 6, 0, 1); - run(1, 2, 7, 7, 0, 1); - } + run(1, 1, 3, 2, 1, 1); + run(1, 1, 5, 2, 1, 1); + run(1, 1, 6, 2, 1, 1); + run(1, 1, 7, 2, 1, 1); + run(1, 1, 9, 2, 1, 1); + run(1, 1, 10, 2, 1, 1); + run(1, 1, 11, 2, 1, 1); + run(1, 1, 13, 2, 1, 1); + run(1, 1, 14, 2, 1, 1); + run(1, 1, 15, 2, 1, 1); + run(1, 1, 17, 2, 1, 1); + run(1, 1, 18, 2, 1, 1); + run(1, 1, 19, 2, 1, 1); + run(1, 1, 21, 2, 1, 1); + run(1, 1, 22, 2, 1, 1); + run(1, 1, 23, 2, 1, 1); + run(1, 1, 25, 2, 1, 1); + run(1, 1, 26, 2, 1, 1); + run(1, 1, 27, 2, 1, 1); + run(1, 1, 29, 2, 1, 1); + run(1, 1, 30, 2, 1, 1); + run(1, 1, 31, 2, 1, 1); + run(4, 8, 32, 3, 3 / 2, 1); + run(4, 8, 32, 5, 5 / 2, 1); run(4, 8, 32, 7, 7 / 2, 1); run(4, 8, 32, 9, 9 / 2, 1); run(4, 8, 32, 11, 11 / 2, 1); @@ -724,8 +804,22 @@ TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32) { run(4, 8, 32, 25, 25 / 2, 1); run(4, 8, 32, 27, 27 / 2, 1); run(4, 8, 32, 29, 29 / 2, 1); - run(4, 1, 32, 29, 29 / 2, 1); run(4, 8, 32, 31, 31 / 2, 1); + run(4, 8, 31, 3, 3 / 2, 1); + run(4, 8, 31, 5, 5 / 2, 1); + run(4, 8, 31, 7, 7 / 2, 1); + run(4, 8, 31, 9, 9 / 2, 1); + run(4, 8, 31, 11, 11 / 2, 1); + run(4, 8, 31, 13, 13 / 2, 1); + run(4, 8, 31, 15, 15 / 2, 1); + run(4, 8, 31, 17, 17 / 2, 1); + run(4, 8, 31, 19, 19 / 2, 1); + run(4, 8, 31, 21, 21 / 2, 1); + run(4, 8, 31, 23, 23 / 2, 1); + run(4, 8, 31, 25, 25 / 2, 1); + run(4, 8, 31, 27, 27 / 2, 1); + run(4, 8, 31, 29, 29 / 2, 1); + run(4, 8, 31, 31, 31 / 2, 1); } } @@ -761,16 +855,30 @@ TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32_RIN_EQ_ROUT) { /*rout*/ {n, oh, oh}, /*grad*/ {n, g * 1, ih, ih}}); }; - if (dt == dtype::Int32()) { - // NOTE: UINT8 assert the spatial size of src&dst is 4*N - run(4, 8, 32, 5, 5 / 2, 1); - run(1, 2, 2, 2, 0, 1); - run(1, 2, 3, 3, 0, 1); - run(1, 2, 4, 4, 0, 1); - run(1, 2, 5, 5, 0, 1); - run(1, 2, 6, 6, 0, 1); - run(1, 2, 7, 7, 0, 1); - } + run(1, 1, 3, 2, 1, 1); + run(1, 1, 5, 2, 1, 1); + run(1, 1, 6, 2, 1, 1); + run(1, 1, 7, 2, 1, 1); + run(1, 1, 9, 2, 1, 1); + run(1, 1, 10, 2, 1, 1); + run(1, 1, 11, 2, 1, 1); + run(1, 1, 13, 2, 1, 1); + run(1, 1, 14, 2, 1, 1); + run(1, 1, 15, 2, 1, 1); + run(1, 1, 17, 2, 1, 1); + run(1, 1, 18, 2, 1, 1); + run(1, 1, 19, 2, 1, 1); + run(1, 1, 21, 2, 1, 1); + run(1, 1, 22, 2, 1, 1); + run(1, 1, 23, 2, 1, 1); + run(1, 1, 25, 2, 1, 1); + run(1, 1, 26, 2, 1, 1); + run(1, 1, 27, 2, 1, 1); + run(1, 1, 29, 2, 1, 1); + run(1, 1, 30, 2, 1, 1); + run(1, 1, 31, 2, 1, 1); + run(4, 8, 32, 3, 3 / 2, 1); + run(4, 8, 32, 5, 5 / 2, 1); run(4, 8, 32, 7, 7 / 2, 1); run(4, 8, 32, 9, 9 / 2, 1); run(4, 8, 32, 11, 11 / 2, 1); @@ -781,10 +889,24 @@ TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_DATA_FP32_RIN_EQ_ROUT) { run(4, 8, 32, 21, 21 / 2, 1); run(4, 8, 32, 23, 23 / 2, 1); run(4, 8, 32, 25, 25 / 2, 1); - run(4, 1, 32, 25, 25 / 2, 1); run(4, 8, 32, 27, 27 / 2, 1); run(4, 8, 32, 29, 29 / 2, 1); run(4, 8, 32, 31, 31 / 2, 1); + run(4, 8, 31, 3, 3 / 2, 1); + run(4, 8, 31, 5, 5 / 2, 1); + run(4, 8, 31, 7, 7 / 2, 1); + run(4, 8, 31, 9, 9 / 2, 1); + run(4, 8, 31, 11, 11 / 2, 1); + run(4, 8, 31, 13, 13 / 2, 1); + run(4, 8, 31, 15, 15 / 2, 1); + run(4, 8, 31, 17, 17 / 2, 1); + run(4, 8, 31, 19, 19 / 2, 1); + run(4, 8, 31, 21, 21 / 2, 1); + run(4, 8, 31, 23, 23 / 2, 1); + run(4, 8, 31, 25, 25 / 2, 1); + run(4, 8, 31, 27, 27 / 2, 1); + run(4, 8, 31, 29, 29 / 2, 1); + run(4, 8, 31, 31, 31 / 2, 1); } } @@ -824,15 +946,13 @@ TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_FILTER_FP32) { {g, 1, 1, fh, fh} // grad }); }; - if (dt == dtype::Int32()) { - run(4, 8, 32, 5, 5 / 2, 1); - run(1, 2, 2, 2, 0, 1); - run(1, 2, 3, 3, 0, 1); - run(1, 2, 4, 4, 0, 1); - run(1, 2, 5, 5, 0, 1); - run(1, 2, 6, 6, 0, 1); - run(1, 2, 7, 7, 0, 1); - } + run(4, 8, 32, 5, 5 / 2, 1); + run(1, 2, 2, 2, 0, 1); + run(1, 2, 3, 3, 0, 1); + run(1, 2, 4, 4, 0, 1); + run(1, 2, 5, 5, 0, 1); + run(1, 2, 6, 6, 0, 1); + run(1, 2, 7, 7, 0, 1); run(4, 8, 32, 7, 7 / 2, 1); run(4, 8, 32, 9, 9 / 2, 1); run(4, 8, 32, 11, 11 / 2, 1); @@ -886,15 +1006,13 @@ TEST_F(CUDA, REGION_RESTRICTED_CONV_BWD_FILTER_FP32_RIN_EQ_ROUT) { {g, 1, 1, fh, fh} // grad }); }; - if (dt == dtype::Int32()) { - run(4, 8, 32, 5, 5 / 2, 1); - run(1, 2, 2, 2, 0, 1); - run(1, 2, 3, 3, 0, 1); - run(1, 2, 4, 4, 0, 1); - run(1, 2, 5, 5, 0, 1); - run(1, 2, 6, 6, 0, 1); - run(1, 2, 7, 7, 0, 1); - } + run(4, 8, 32, 5, 5 / 2, 1); + run(1, 2, 2, 2, 0, 1); + run(1, 2, 3, 3, 0, 1); + run(1, 2, 4, 4, 0, 1); + run(1, 2, 5, 5, 0, 1); + run(1, 2, 6, 6, 0, 1); + run(1, 2, 7, 7, 0, 1); run(4, 8, 32, 7, 7 / 2, 1); run(4, 8, 32, 9, 9 / 2, 1); run(4, 8, 32, 11, 11 / 2, 1); diff --git a/imperative/python/test/unit/functional/test_functional.py b/imperative/python/test/unit/functional/test_functional.py index d2c2e8e6..5b2d7941 100644 --- a/imperative/python/test/unit/functional/test_functional.py +++ b/imperative/python/test/unit/functional/test_functional.py @@ -1061,8 +1061,8 @@ def test_region_restricted_conv_forward_backward_uint8(bias, groups): N = 1 GROUP = groups FH = FW = 1 - IH = IW = 4 - OH = OW = 4 + IH = IW = 3 + OH = OW = 3 ICPG = OCPG = 1 grad_shape = (N, GROUP * ICPG, IH, IW) src_shape = grad_shape