From cdbe44f8b43dc546893c5cec1a3df6460eef5d51 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team
Date: Thu, 18 Jun 2020 19:40:08 +0800
Subject: [PATCH] feat(dnn): add gemv support in conv1x1 with format NCHW

GitOrigin-RevId: 97679e85265dd8aa18ed022f3e02f240a796f79e
---
 dnn/src/arm_common/conv_bias/postprocess_helper.h | 10 +-
 dnn/src/arm_common/matrix_mul/algos.cpp | 64 +--
 dnn/src/arm_common/matrix_mul/fp16/hgemv.cpp | 10 +-
 dnn/src/arm_common/matrix_mul/fp16/hgemv.h | 9 +-
 dnn/src/arm_common/matrix_mul/fp32/exec_sgemv.cpp | 40 +-
 dnn/src/arm_common/matrix_mul/fp32/exec_sgemv.h | 7 +-
 dnn/src/arm_common/matrix_mul/int8/gemv.cpp | 22 +-
 dnn/src/arm_common/matrix_mul/int8/gemv.h | 9 +-
 dnn/src/fallback/conv_bias/conv1x1/algos.cpp | 43 +-
 dnn/src/fallback/conv_bias/conv1x1/algos.h | 4 +-
 .../conv_bias/conv1x1/algos_conv1x1_gemv.cpp | 448 +++++++++++++++++++++
 .../conv_bias/conv1x1/algos_conv1x1_gemv.h | 47 +++
 .../conv_bias/conv1x1/conv1x1_dispatcher.h | 24 +-
 .../conv_bias/conv1x1/conv1x1_strategy.cpp | 79 +---
 .../fallback/conv_bias/conv1x1/conv1x1_strategy.h | 48 +--
 .../fallback/conv_bias/conv1x1/conv1x1_utils.cpp | 75 ++++
 dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.h | 102 +++++
 dnn/src/fallback/conv_bias/im2col/algos.cpp | 7 +-
 dnn/src/fallback/conv_bias/opr_impl.cpp | 5 +
 dnn/src/fallback/conv_bias/opr_impl.h | 1 +
 dnn/src/fallback/matrix_mul/algos.cpp | 34 +-
 dnn/src/fallback/matrix_mul/gemv.h | 71 ++++
 dnn/test/arm_common/conv_bias_multi_thread.cpp | 102 ++++-
 dnn/test/arm_common/matrix_mul.cpp | 45 +++
 dnn/test/common/conv_bias.cpp | 2 +-
 25 files changed, 1006 insertions(+), 302 deletions(-)
 create mode 100644 dnn/src/fallback/conv_bias/conv1x1/algos_conv1x1_gemv.cpp
 create mode 100644 dnn/src/fallback/conv_bias/conv1x1/algos_conv1x1_gemv.h
 create mode 100644 dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.cpp
 create mode 100644 dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.h
 create mode 100644 dnn/src/fallback/matrix_mul/gemv.h

diff --git a/dnn/src/arm_common/conv_bias/postprocess_helper.h b/dnn/src/arm_common/conv_bias/postprocess_helper.h
index 4e4cbd42..6fdb18b7 100644
--- a/dnn/src/arm_common/conv_bias/postprocess_helper.h
+++ b/dnn/src/arm_common/conv_bias/postprocess_helper.h
@@ -193,7 +193,7 @@ struct PostProcess {
         DEFAULT \
     }

-#define FOR_BIAS(_bias_mode) \
+#define FOR_BIAS(_bias_mode, OH, OW) \
     switch (_bias_mode) { \
         case megdnn::BiasMode::NO_BIAS: \
             FOR_NONLINEAR_NOBIAS(FOR_NONLINEAR_UNARY); \
@@ -208,6 +208,10 @@ struct PostProcess {
             } \
             break; \
         default: \
+            if (OH * OW == 1) { \
+                FOR_NONLINEAR(FOR_NONLINEAR_BINARY_BROADCAST); \
+                break; \
+            } \
             megdnn_throw("quantized unsupported biasmode"); \
             break; \
     }
@@ -218,7 +222,9 @@ struct PostProcess {
                     megdnn::BiasMode bias_mode, megdnn::NonlineMode nonlineMode,
                     megdnn::DType bias_type, megdnn::DType dst_type, size_t N,
                     size_t OC, size_t OH, size_t OW, size_t pack_oc_size = 1) {
-        FOR_BIAS(bias_mode);
+        //! When OH * OW == 1, bias_mode will be BiasMode::BIAS. That is
+        //! wrong, so we handle this case in the default branch.
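A note on why the new default branch is correct: the conv1x1 gemv worker added later in this patch invokes the postprocess with N == 1 and OH == OW == 1, and passes a bias pointer already offset to the current batch and oc block, so per-element bias (BiasMode::BIAS) and per-channel broadcast bias add exactly the same values. A minimal sketch of the degenerate case; the helper name is hypothetical and not part of this patch:

    #include <cstddef>
    // With N == 1 and OH == OW == 1 every output channel has exactly one
    // element, so BiasMode::BIAS degenerates to a per-channel broadcast.
    inline void add_bias_spatial_one(float* dst, const float* bias,
                                     size_t OC) {
        for (size_t oc = 0; oc < OC; ++oc)
            dst[oc] += bias[oc];
    }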
+ FOR_BIAS(bias_mode, OH, OW); } }; diff --git a/dnn/src/arm_common/matrix_mul/algos.cpp b/dnn/src/arm_common/matrix_mul/algos.cpp index 6e4ffe93..b5f03e47 100644 --- a/dnn/src/arm_common/matrix_mul/algos.cpp +++ b/dnn/src/arm_common/matrix_mul/algos.cpp @@ -43,9 +43,9 @@ void exec_int_8x8x16(const MatrixMulImpl::KernParam& kern_param) { size_t N = kern_param.N; size_t K = kern_param.K; size_t LDB = kern_param.LDB; - exec_gemm_int8_int8_int16(kern_param.A(), - kern_param.B(), - kern_param.C(), M, K, N, LDB, w0, w1); + exec_gemm_int8_int8_int16( + kern_param.A(), kern_param.B(), + kern_param.C(), M, K, N, LDB, w0, w1); } MIDOUT_END(); } @@ -79,8 +79,7 @@ void int8x8x32_gemv_kern(const MatrixMulImpl::KernParam& kern_param) { auto LDA = kern_param.LDA, LDB = kern_param.LDB, LDC = kern_param.LDC; const auto Aptr = kern_param.A(), Bptr = kern_param.B(); auto Cptr = kern_param.C(); - arm_common::matmul::gemv_like_int8(Aptr, Bptr, Cptr, M, N, K, LDA, LDB, - LDC); + gemv_like(Aptr, Bptr, Cptr, M, N, K, LDA, LDB, LDC); } } // anonymous namespace @@ -110,7 +109,7 @@ void f32_gemv_kern(const MatrixMulImpl::KernParam& kern_param) { const auto Aptr = kern_param.A(), Bptr = kern_param.B(); auto Cptr = kern_param.C(); - arm_common::sgemm_sgemv_like(Aptr, Bptr, Cptr, M, N, K, LDA, LDB, LDC); + gemv_like(Aptr, Bptr, Cptr, M, N, K, LDA, LDB, LDC); } } // anonymous namespace @@ -140,25 +139,14 @@ MatrixMulImpl::kern_t MatrixMulImpl::AlgoF32Gemv::get_kern( /* ===================== F32 Gevm algo ===================== */ namespace { - -void gevm_fp32_kern(const MatrixMulImpl::KernParam& kern_param) { - auto M = kern_param.M, N = kern_param.N, K = kern_param.K; - auto LDB = kern_param.LDB; - const auto Aptr = kern_param.A(), - Bptr = kern_param.B(); - auto Cptr = kern_param.C(); - arm_common::sgemm_sgemv_like(Bptr, Aptr, Cptr, N, M, K, LDB, 1, 1); -} - -void gevm_int8_kern(const MatrixMulImpl::KernParam& kern_param) { +template +void gevm_like_kern(const MatrixMulImpl::KernParam& kern_param) { auto M = kern_param.M, N = kern_param.N, K = kern_param.K; auto LDB = kern_param.LDB; - const auto Aptr = kern_param.A(), - Bptr = kern_param.B(); - auto Cptr = kern_param.C(); - arm_common::matmul::gemv_like_int8(Bptr, Aptr, Cptr, N, M, K, LDB, 1, 1); + const auto Aptr = kern_param.A(), Bptr = kern_param.B(); + auto Cptr = kern_param.C(); + megdnn::arm_common::gemv_like(Bptr, Aptr, Cptr, N, M, K, LDB, 1, 1); } - } // anonymous namespace bool MatrixMulImpl::AlgoGevm::usable( @@ -170,8 +158,16 @@ bool MatrixMulImpl::AlgoGevm::usable( kern_size_param.B_type == kern_size_param.A_type && kern_size_param.C_type == kern_size_param.A_type && kern_size_param.A_type == dtype::Float32(); - return (fp32_ok || can_be_treated_as_int8x8x32(kern_size_param)) && - preferred(kern_size_param); + bool fp16_ok = false; +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + fp16_ok = kern_size_param.compute_mode == Param::ComputeMode::DEFAULT && + kern_size_param.format == param::MatrixMul::Format::DEFAULT && + kern_size_param.B_type == kern_size_param.A_type && + kern_size_param.C_type == kern_size_param.A_type && + kern_size_param.A_type == dtype::Float16(); +#endif + bool int8_ok = can_be_treated_as_int8x8x32(kern_size_param); + return (fp32_ok || fp16_ok || int8_ok) && preferred(kern_size_param); } bool MatrixMulImpl::AlgoGevm::preferred( @@ -183,11 +179,17 @@ bool MatrixMulImpl::AlgoGevm::preferred( MatrixMulImpl::kern_t MatrixMulImpl::AlgoGevm::get_kern( const KernSizeParam& kern_size_param) const { if (kern_size_param.A_type == dtype::Float32()) { 
-        return gevm_fp32_kern;
+        return gevm_like_kern<float, float>;
     } else if (kern_size_param.A_type.enumv() == DTypeEnum::Int8 ||
                kern_size_param.A_type.enumv() == DTypeEnum::QuantizedS8) {
-        return gevm_int8_kern;
-    } else {
+        return gevm_like_kern<dt_int8, dt_int32>;
+    }
+#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+    else if (kern_size_param.A_type == dtype::Float16()) {
+        return gevm_like_kern<__fp16, __fp16>;
+    }
+#endif
+    else {
         megdnn_assert(
                 false, "no available kern got A_type: %s B_type: %s C_type: %s",
                 kern_size_param.A_type.name(), kern_size_param.B_type.name(),
@@ -205,10 +207,10 @@ void f16_gemv_kern(const MatrixMulImpl::KernParam& kern_param) {
                Bptr = kern_param.B<dt_float16>();
     auto Cptr = kern_param.C<dt_float16>();
     MIDOUT_BEGIN(megdnn_arm_hgemv, void) {
-        arm_common::hgemv_exec(reinterpret_cast<const __fp16*>(Aptr),
-                               reinterpret_cast<const __fp16*>(Bptr),
-                               reinterpret_cast<__fp16*>(Cptr), M, N, K, LDA,
-                               LDB, LDC);
+        arm_common::gemv_like(reinterpret_cast<const __fp16*>(Aptr),
+                              reinterpret_cast<const __fp16*>(Bptr),
+                              reinterpret_cast<__fp16*>(Cptr), M, N, K, LDA,
+                              LDB, LDC);
     }
     MIDOUT_END();
 }
diff --git a/dnn/src/arm_common/matrix_mul/fp16/hgemv.cpp b/dnn/src/arm_common/matrix_mul/fp16/hgemv.cpp
index 38b58165..0a9e5db2 100644
--- a/dnn/src/arm_common/matrix_mul/fp16/hgemv.cpp
+++ b/dnn/src/arm_common/matrix_mul/fp16/hgemv.cpp
@@ -96,11 +96,11 @@ void hgemv_naive_n(const __fp16* __restrict A, const __fp16* __restrict B,
 }
 }  // namespace

-void megdnn::arm_common::hgemv_exec(const __fp16* __restrict A,
-                                    const __fp16* __restrict B,
-                                    __fp16* __restrict C, size_t M, size_t N,
-                                    size_t K, size_t Astride, size_t Bstride,
-                                    size_t Cstride) {
+void megdnn::arm_common::gemv_like(const __fp16* __restrict A,
+                                   const __fp16* __restrict B,
+                                   __fp16* __restrict C, size_t M, size_t N,
+                                   size_t K, size_t Astride, size_t Bstride,
+                                   size_t Cstride) {
     megdnn_assert((M <= 4) || (M == 8 && K <= 2) || (N == 1 && Bstride == 1));
     if (N == 1) {
         return hgemv_naive_n(A, B, C, M, N, K, Astride, Bstride, Cstride);
diff --git a/dnn/src/arm_common/matrix_mul/fp16/hgemv.h b/dnn/src/arm_common/matrix_mul/fp16/hgemv.h
index 8a2fa952..68a186b7 100644
--- a/dnn/src/arm_common/matrix_mul/fp16/hgemv.h
+++ b/dnn/src/arm_common/matrix_mul/fp16/hgemv.h
@@ -16,13 +16,14 @@
 namespace megdnn {
 namespace arm_common {

-void hgemv_exec(const __fp16* __restrict A, const __fp16* __restrict B,
-                __fp16* __restrict C, size_t M, size_t N, size_t K,
-                size_t Astride, size_t Bstride, size_t Cstride);
-
 bool is_hgemv_preferred(bool transposeA, bool transposeB, size_t M, size_t N,
                         size_t K, size_t /*LDA*/, size_t LDB, size_t /*LDC*/);

+void gemv_like(const __fp16* __restrict A, const __fp16* __restrict B,
+               __fp16* __restrict C, size_t M, size_t N, size_t K,
+               size_t Astride, size_t Bstride, size_t Cstride);
+
+
 }  // namespace aarch64
 }  // namespace megdnn
diff --git a/dnn/src/arm_common/matrix_mul/fp32/exec_sgemv.cpp b/dnn/src/arm_common/matrix_mul/fp32/exec_sgemv.cpp
index 31e0f535..01f50a45 100644
--- a/dnn/src/arm_common/matrix_mul/fp32/exec_sgemv.cpp
+++ b/dnn/src/arm_common/matrix_mul/fp32/exec_sgemv.cpp
@@ -42,40 +42,6 @@ void sgemv_naive_n(const float* __restrict A, const float* __restrict B,
 #define calculate(i) sum##i = vmlaq_f32(sum##i, a##i, b0);
 #define vstore(i) C[(m + i) * Cstride] = vaddvq_f32(sum##i) + acc##i;
     size_t m = 0;
-    for (; m + 4 <= M; m += 4) {
-        float acc0, acc1, acc2, acc3;
-        float32x4_t a0, a1, a2, a3, b0;
-        float32x4_t sum0, sum1, sum2, sum3;
-        UNROLL_OUT(vdupq_sum, 4)
-        size_t k = 0;
-        for (; k + 4 <= K; k += 4) {
-            UNROLL_OUT(loadA, 4)
-            UNROLL_OUT(loadB, 1)
-            UNROLL_OUT(calculate, 4)
-        }
-
UNROLL_OUT(reset_acc, 4) - for (; k < K; ++k) { - UNROLL_OUT(acc_calu, 4) - } - UNROLL_OUT(vstore, 4) - } - for (; m + 2 <= M; m += 2) { - float acc0, acc1; - float32x4_t a0, a1, b0; - float32x4_t sum0, sum1; - UNROLL_OUT(vdupq_sum, 2) - size_t k = 0; - for (; k + 4 <= K; k += 4) { - UNROLL_OUT(loadA, 2) - UNROLL_OUT(loadB, 1) - UNROLL_OUT(calculate, 2) - } - UNROLL_OUT(reset_acc, 2) - for (; k < K; ++k) { - UNROLL_OUT(acc_calu, 2) - } - UNROLL_OUT(vstore, 2) - } for (; m < M; m += 1) { float acc0; float32x4_t a0, b0; @@ -107,9 +73,9 @@ void sgemv_naive_n(const float* __restrict A, const float* __restrict B, namespace megdnn { namespace arm_common { -void sgemm_sgemv_like(const float* __restrict A, const float* __restrict B, - float* __restrict C, size_t M, size_t N, size_t K, - size_t Astride, size_t Bstride, size_t Cstride) { +void gemv_like(const float* __restrict A, const float* __restrict B, + float* __restrict C, size_t M, size_t N, size_t K, + size_t Astride, size_t Bstride, size_t Cstride) { megdnn_assert(M < 8 || (M == 8 && K <= 2) || (N == 1 && Bstride == 1)); if (N == 1) { return sgemv_naive_n(A, B, C, M, N, K, Astride, Bstride, Cstride); diff --git a/dnn/src/arm_common/matrix_mul/fp32/exec_sgemv.h b/dnn/src/arm_common/matrix_mul/fp32/exec_sgemv.h index d1c352be..982d22a0 100644 --- a/dnn/src/arm_common/matrix_mul/fp32/exec_sgemv.h +++ b/dnn/src/arm_common/matrix_mul/fp32/exec_sgemv.h @@ -20,9 +20,10 @@ bool is_sgemv_like_preferred(bool row_major, bool transposeA, bool transposeB, size_t /* LDA */, size_t LDB, float beta, size_t /* LDC */); -void sgemm_sgemv_like(const float* __restrict A, const float* __restrict B, - float* __restrict C, size_t M, size_t N, size_t K, - size_t Astride, size_t Bstride, size_t Cstride); +void gemv_like(const float* __restrict A, const float* __restrict B, + float* __restrict C, size_t M, size_t N, size_t K, + size_t Astride, size_t Bstride, size_t Cstride); + } // namespace arm_common } // namespace megdnn diff --git a/dnn/src/arm_common/matrix_mul/int8/gemv.cpp b/dnn/src/arm_common/matrix_mul/int8/gemv.cpp index 2f86fb15..7bc5509f 100644 --- a/dnn/src/arm_common/matrix_mul/int8/gemv.cpp +++ b/dnn/src/arm_common/matrix_mul/int8/gemv.cpp @@ -172,9 +172,10 @@ void gemv_naive_n(const int8_t* __restrict A, const int8_t* __restrict B, } // namespace #endif -bool matmul::is_gemv_like_preferred_int8(bool transposeA, bool transposeB, - size_t M, size_t N, size_t K, - size_t LDA, size_t LDB, size_t LDC) { +bool arm_common::is_gemv_like_preferred_int8(bool transposeA, bool transposeB, + size_t M, size_t N, size_t K, + size_t LDA, size_t LDB, + size_t LDC) { MEGDNN_MARK_USED_VAR(LDA); MEGDNN_MARK_USED_VAR(LDB); MEGDNN_MARK_USED_VAR(LDC); @@ -188,15 +189,16 @@ bool matmul::is_gemv_like_preferred_int8(bool transposeA, bool transposeB, return N == 1 && LDB == 1; } -void matmul::gemv_like_int8(const int8_t* __restrict A, - const int8_t* __restrict B, int32_t* __restrict C, - size_t M, size_t N, size_t K, size_t Astride, - size_t Bstride, size_t Cstride) { +void arm_common::gemv_like(const int8_t* __restrict A, + const int8_t* __restrict B, int32_t* __restrict C, + size_t M, size_t N, size_t K, size_t Astride, + size_t Bstride, size_t Cstride) { megdnn_assert(N == 1); - MIDOUT_BEGIN(megdnn_arm_common_int8_gemv) { + MIDOUT_BEGIN(megdnn_arm_common_int8_gemv, + midout_iv("INT8_gemv_like"_hash)) { return gemv_naive_n(A, B, C, M, N, K, Astride, Bstride, Cstride); - } MIDOUT_END(); + } + MIDOUT_END(); } - // vim: syntax=cpp.doxygen diff --git 
a/dnn/src/arm_common/matrix_mul/int8/gemv.h b/dnn/src/arm_common/matrix_mul/int8/gemv.h index 5b1a558c..3080167f 100644 --- a/dnn/src/arm_common/matrix_mul/int8/gemv.h +++ b/dnn/src/arm_common/matrix_mul/int8/gemv.h @@ -15,16 +15,15 @@ namespace megdnn { namespace arm_common { -namespace matmul { bool is_gemv_like_preferred_int8(bool transposeA, bool transposeB, size_t M, size_t N, size_t K, size_t LDA, size_t LDB, size_t LDC); -void gemv_like_int8(const int8_t* __restrict A, const int8_t* __restrict B, - int32_t* __restrict C, size_t M, size_t N, size_t K, - size_t Astride, size_t Bstride, size_t Cstride); -} // namespace matmul +void gemv_like(const int8_t* __restrict A, const int8_t* __restrict B, + int32_t* __restrict C, size_t M, size_t N, size_t K, + size_t Astride, size_t Bstride, size_t Cstride); + } // namespace arm_common } // namespace megdnn diff --git a/dnn/src/fallback/conv_bias/conv1x1/algos.cpp b/dnn/src/fallback/conv_bias/conv1x1/algos.cpp index 611d7d68..84a8bcb7 100644 --- a/dnn/src/fallback/conv_bias/conv1x1/algos.cpp +++ b/dnn/src/fallback/conv_bias/conv1x1/algos.cpp @@ -54,7 +54,7 @@ size_t ConvBiasImpl::AlgoConv1x1::get_workspace( size_t compt_oc_block_size = get_oc_tile_size_heuristic(param); auto matmul_param = - get_matmul_kern_param(param, OH * OW, compt_oc_block_size); + utils::get_matmul_kern_param(param, OH * OW, compt_oc_block_size); auto pack_mode = m_matmul_algo->packmode(); if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) { @@ -92,7 +92,6 @@ size_t ConvBiasImpl::AlgoConv1x1::get_workspace( SmallVector ConvBiasImpl::AlgoConv1x1::dispatch_kerns( ConvBiasImpl* opr, const NCBKernSizeParam& param) const { SmallVector ret_kern; - size_t OH = param.osz[0]; size_t OW = param.osz[1]; size_t OC = param.filter_meta.ocpg; @@ -102,7 +101,7 @@ SmallVector ConvBiasImpl::AlgoConv1x1::dispatch_kerns( size_t oc_blocks_per_group = div_ceil(OC, compt_oc_block_size); auto matmul_param = - get_matmul_kern_param(param, OH * OW, compt_oc_block_size); + utils::get_matmul_kern_param(param, OH * OW, compt_oc_block_size); WorkspaceBundle whole_bundle = {nullptr, {}}; WorkspaceBundle thread_bundle = {nullptr, {}}; WorkspaceBundle matmul_bundle = {nullptr, {}}; @@ -138,7 +137,7 @@ SmallVector ConvBiasImpl::AlgoConv1x1::dispatch_kerns( } //! get thread bundle - thread_bundle = get_thread_bundle(param, matmul_bundle.get_size(2), + thread_bundle = utils::get_thread_bundle(param, matmul_bundle.get_size(2), compt_oc_block_size); Conv1x1StrategyBase* conv1x1_strategy = @@ -178,7 +177,6 @@ SmallVector ConvBiasImpl::AlgoConv1x1::dispatch_kerns( } } ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}}); - return ret_kern; } @@ -201,8 +199,11 @@ bool ConvBiasImpl::AlgoConv1x1::usable(ConvBiasImpl* opr, if (FH != 1 || FW != 1 || PH || PW || SH != 1 || SW != 1) return false; - if (param.src_type.enumv() != param.filter_type.enumv() && - param.src_type.enumv() != DTypeEnum::Int8 && + if(param.src_type.enumv() != param.filter_type.enumv()) { + return false; + } + + if (param.src_type.enumv() != DTypeEnum::Int8 && param.src_type.enumv() != DTypeEnum::QuantizedS8 && param.src_type.enumv() != DTypeEnum::Quantized8Asymm && #if !MEGDNN_DISABLE_FLOAT16 @@ -211,6 +212,7 @@ bool ConvBiasImpl::AlgoConv1x1::usable(ConvBiasImpl* opr, param.src_type.enumv() != DTypeEnum::Float32) { return false; } + //! make sure 8x8x16 and 8x8x32 biasmode is nobias and nonlineMode //! is identity otherwise return false mean that 8x8x32 and 8x8x16 //! 
 not support PostProcess
@@ -233,7 +235,8 @@ bool ConvBiasImpl::AlgoConv1x1::usable(ConvBiasImpl* opr,
     size_t OH = param.osz[0];
     size_t OW = param.osz[1];

-    MatrixMulImpl::KernSizeParam matmul_param = get_matmul_kern_param(
+
+    MatrixMulImpl::KernSizeParam matmul_param = utils::get_matmul_kern_param(
             param, OH * OW, get_oc_tile_size_heuristic(param));
     bool matmul_usable = m_matmul_algo->usable(matmul_param);

@@ -250,3 +253,27 @@ bool ConvBiasImpl::AlgoConv1x1::usable(ConvBiasImpl* opr,
     MIDOUT_END();
     return false;
 }
+
+bool ConvBiasImpl::AlgoConv1x1::is_preferred(
+        ConvBiasImpl*, const NCBKernSizeParam& param) const {
+    size_t OH = param.osz[0];
+    size_t OW = param.osz[1];
+    if (OH * OW != 1) {
+        return true;
+    } else {
+#if (MEGDNN_ARMV7 || MEGDNN_AARCH64)
+        if (param.src_type.enumv() == DTypeEnum::Int8 &&
+            param.filter_type.enumv() == DTypeEnum::Int8 &&
+            param.dst_type.enumv() == DTypeEnum::Int16) {
+            return true;
+        }
+#elif MEGDNN_X86
+        size_t OC = param.filter_meta.ocpg;
+        if (OC > 2 || param.src_type.enumv() == DTypeEnum::Float32)
+            return true;
+#endif
+        return false;
+    }
+}
+
+// vim: syntax=cpp.doxygen
\ No newline at end of file
diff --git a/dnn/src/fallback/conv_bias/conv1x1/algos.h b/dnn/src/fallback/conv_bias/conv1x1/algos.h
index b5c76664..860f1f42 100644
--- a/dnn/src/fallback/conv_bias/conv1x1/algos.h
+++ b/dnn/src/fallback/conv_bias/conv1x1/algos.h
@@ -41,9 +41,7 @@ public:
     SmallVector dispatch_kerns(
             ConvBiasImpl* opr, const NCBKernSizeParam& param) const override;

-    bool is_preferred(ConvBiasImpl*, const NCBKernSizeParam&) const override{
-        return true;
-    }
+    bool is_preferred(ConvBiasImpl*, const NCBKernSizeParam&) const override;

 protected:
     size_t get_oc_tile_size_heuristic(const NCBKernSizeParam& param) const;
diff --git a/dnn/src/fallback/conv_bias/conv1x1/algos_conv1x1_gemv.cpp b/dnn/src/fallback/conv_bias/conv1x1/algos_conv1x1_gemv.cpp
new file mode 100644
index 00000000..c4c1e147
--- /dev/null
+++ b/dnn/src/fallback/conv_bias/conv1x1/algos_conv1x1_gemv.cpp
@@ -0,0 +1,448 @@
+/**
+ * \file dnn/src/fallback/conv_bias/conv1x1/algos_conv1x1_gemv.cpp
+ * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
+ *
+ * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
+ */ + +#include "src/fallback/conv_bias/conv1x1/algos_conv1x1_gemv.h" +#include "src/fallback/conv_bias/conv1x1/conv1x1_utils.h" + +#include "src/common/opr_delegate.h" +#include "src/fallback/conv_bias/common.h" +#include "src/fallback/conv_bias/opr_impl.h" + +#include "megdnn/opr_param_defs.h" +#include "src/naive/convolution/helper.h" + +#include "src/fallback/matrix_mul/gemv.h" +#if MEGDNN_X86 +#include "src/x86/conv_bias/postprocess_helper.h" +#elif (MEGDNN_ARMV7 || MEGDNN_AARCH64) +#include "src/arm_common/conv_bias/postprocess_helper.h" +#include "src/arm_common/matrix_mul/fp32/exec_sgemv.h" +#include "src/arm_common/matrix_mul/fp16/hgemv.h" +#include "src/arm_common/matrix_mul/int8/gemv.h" +#endif + +#include "midout.h" +MIDOUT_DECL(megdnn_fallback_conv1x1_gemv) + +using namespace megdnn; +using namespace fallback; +#if MEGDNN_X86 +using namespace x86; +#endif +using namespace conv1x1; + +namespace { + +#if MEGDNN_X86 +template +struct GemvLike { + inline static void do_gemv(const stype* A, const stype* B, btype* C, + size_t M, size_t N, size_t K, size_t LDA, + size_t LDB, size_t LDC, DType src, + DType filter) { + megdnn_throw("x86 conv1x1 gemv only supports format : NCHW"); + } +}; + +template +struct GemvLike { + inline static void do_gemv(const stype* A, const stype* B, btype* C, + size_t M, size_t N, size_t K, size_t LDA, + size_t LDB, size_t LDC, DType src, + DType filter) { + MEGDNN_MARK_USED_VAR(src); + MEGDNN_MARK_USED_VAR(filter); + megdnn::fallback::gemv_like(A, B, C, M, N, K, LDA, LDB, + LDC); + } +}; + +#elif MEGDNN_AARCH64 || MEGDNN_ARMV7 +template +struct GemvLike { + inline static void do_gemv(const stype* A, const stype* B, btype* C, + size_t M, size_t N, size_t K, size_t LDA, + size_t LDB, size_t LDC, DType src, + DType filter) { + megdnn_throw("arm conv1x1 gemv only supports format : NCHW"); + } +}; + +template +struct GemvLike { + inline static void do_gemv(const stype* A, const stype* B, btype* C, + size_t M, size_t N, size_t K, size_t LDA, + size_t LDB, size_t LDC, DType src, + DType filter) { + MEGDNN_MARK_USED_VAR(src); + MEGDNN_MARK_USED_VAR(filter); + megdnn::arm_common::gemv_like(A, B, C, M, N, K, LDA, LDB, LDC); + } +}; + +template <> +struct GemvLike { + inline static void do_gemv(const dt_int8* A, const dt_int8* B, dt_int16* C, + size_t M, size_t N, size_t K, size_t LDA, + size_t LDB, size_t LDC, DType src, + DType filter) { + MEGDNN_MARK_USED_VAR(src); + MEGDNN_MARK_USED_VAR(filter); + megdnn::fallback::gemv_like(A, B, C, M, N, K, LDA, + LDB, LDC); + } +}; + +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> +struct GemvLike { + inline static void do_gemv(const dt_float16* A, const dt_float16* B, + dt_float16* C, size_t M, size_t N, size_t K, + size_t LDA, size_t LDB, size_t LDC, DType src, + DType filter) { + MEGDNN_MARK_USED_VAR(src); + MEGDNN_MARK_USED_VAR(filter); + megdnn::arm_common::gemv_like(reinterpret_cast(A), + reinterpret_cast(B), + reinterpret_cast<__fp16*>(C), M, N, K, + LDA, LDB, LDC); + } +}; +#endif +#endif + +template <> +struct GemvLike { + inline static void do_gemv(const dt_uint8* A, const dt_uint8* B, + dt_int32* C, size_t M, size_t N, size_t K, + size_t LDA, size_t LDB, size_t LDC, DType src, + DType filter) { + uint8_t zp0 = src.param().zero_point; + uint8_t zp1 = filter.param().zero_point; + megdnn::fallback::gemv_like(A, B, C, M, N, K, LDA, + LDB, LDC, zp0, zp1); + } +}; + +template +struct Conv1x1GemvWorker { + static void exec(WorkspaceBundle& whole_bundle, + WorkspaceBundle& thread_bundle, size_t oc_tile_size, + const 
ConvBiasImpl::NCBKernSizeParam& param, + const ConvBiasImpl::NCBKernParam& ncb_param, + const ConvBiasImpl::NCBKernIndex& ncb_index) { + whole_bundle.set(ncb_param.workspace_ptr); + + size_t OC = param.filter_meta.ocpg; + size_t IC = param.filter_meta.icpg; + + size_t batch_id = ncb_index.ndrange_id[0]; + size_t group_id = ncb_index.ndrange_id[1]; + size_t oc_tile_id_in_group = ncb_index.ndrange_id[2]; + size_t thread_id = ncb_index.thread_id; + + size_t oc_start = oc_tile_size * oc_tile_id_in_group; + size_t oc_end = oc_start + oc_tile_size; + oc_end = (oc_end <= OC ? oc_end : OC); + + size_t numbers_of_ncb_filter_offset = + oc_tile_size * IC * oc_tile_id_in_group; + const src_ctype* Aptr = ncb_param.filter(group_id) + + numbers_of_ncb_filter_offset; + + const src_ctype* Bptr = ncb_param.src(batch_id, group_id); + + size_t thread_offset = thread_bundle.total_size_in_bytes() * thread_id; + size_t bytes_offset_of_matmul_dst_this_thread = + thread_offset + thread_bundle.get_size(0); + bias_ctype* matmul_temp_dst = reinterpret_cast( + reinterpret_cast(whole_bundle.get(0)) + + bytes_offset_of_matmul_dst_this_thread); + + size_t numbers_of_ncb_dst_offset = oc_tile_size * oc_tile_id_in_group; + dst_ctype* conv_bias_dst = + ncb_param.dst(batch_id, group_id) + + numbers_of_ncb_dst_offset; + + bool is_dst_8bit = + (param.src_type.enumv() == DTypeEnum::QuantizedS8 && + param.dst_type.enumv() == DTypeEnum::QuantizedS8) || + (param.src_type.enumv() == DTypeEnum::Quantized8Asymm && + param.dst_type.enumv() == DTypeEnum::Quantized8Asymm); + bias_ctype* gemv_dst = + is_dst_8bit ? matmul_temp_dst + : reinterpret_cast(conv_bias_dst); + + GemvLike::do_gemv( + Aptr, Bptr, gemv_dst, oc_end - oc_start, 1, IC, IC, 1, 1, + ncb_param.filter_type, ncb_param.src_type); + + //! do postprocess + void* bias_ptr = nullptr; + if (param.bias_mode == megdnn::BiasMode::BIAS) { + bias_ptr = static_cast(const_cast( + ncb_param.bias(batch_id, group_id) + + numbers_of_ncb_dst_offset)); + } else { + bias_ptr = static_cast(const_cast( + ncb_param.bias(batch_id, group_id) + oc_start)); + } + + PostProcess::run( + gemv_dst, bias_ptr, conv_bias_dst, param.bias_mode, + param.nonlineMode, param.bias_type, param.dst_type, 1_z, + oc_end - oc_start, 1, 1, 1); + } +}; + +} // namespace + +size_t ConvBiasImpl::AlgoConv1x1Gemv::get_oc_tile_size_heuristic( + const NCBKernSizeParam& param) const { + size_t OC = param.filter_meta.ocpg; + size_t oc_block_size_one_thread = div_ceil(OC, param.nr_threads); + return round_up(oc_block_size_one_thread, 16); +} + +size_t ConvBiasImpl::AlgoConv1x1Gemv::get_workspace( + ConvBiasImpl*, const NCBKernSizeParam& param) const { + MIDOUT_BEGIN(megdnn_fallback_conv1x1_gemv, + midout_iv("AlgoConv1x1Gemv::get_workspace"_hash)) { + size_t compt_oc_block_size = get_oc_tile_size_heuristic(param); + auto thread_bundle = + utils::get_thread_bundle(param, 0, compt_oc_block_size); + return WorkspaceBundle{ + nullptr, + {thread_bundle.total_size_in_bytes() * param.nr_threads}} + .total_size_in_bytes(); + } + MIDOUT_END(); +} + +SmallVector +ConvBiasImpl::AlgoConv1x1Gemv::dispatch_kerns( + ConvBiasImpl* opr, const NCBKernSizeParam& param) const { + SmallVector ret_kern; + size_t OC = param.filter_meta.ocpg; + size_t compt_oc_block_size = get_oc_tile_size_heuristic(param); + size_t GROUP = param.filter_meta.group; + size_t BATCH = param.n; + size_t oc_blocks_per_group = div_ceil(OC, compt_oc_block_size); + + //! 
get thread bundle + auto thread_bundle = + utils::get_thread_bundle(param, 0, compt_oc_block_size); + auto whole_bundle = WorkspaceBundle{ + nullptr, {thread_bundle.total_size_in_bytes() * param.nr_threads}}; + + using conv1x1_gemv_kern = + std::function; + conv1x1_gemv_kern conv1x1_gemv_worker = nullptr; + +#define cb1(_format, _dt, _post_ctype, _postprocess_mode, _midout_tag) \ + MIDOUT_BEGIN(megdnn_fallback_conv1x1_gemv, midout_iv(_midout_tag)) { \ + if (param.filter_type.enumv() == DTypeTrait<_dt>::enumv) { \ + conv1x1_gemv_worker = \ + Conv1x1GemvWorker<_dt, _dt, _dt, _post_ctype, _post_ctype, \ + _postprocess_mode, _format>::exec; \ + } \ + } \ + MIDOUT_END() + +#define cb2(_format, _i_src_type, _i_bias_type, _i_dst_type, _src_ctype, \ + _bias_ctype, _dst_ctype, _postprocess_mode, _midout_tag) \ + MIDOUT_BEGIN(megdnn_fallback_conv1x1_gemv, midout_iv(_midout_tag)) { \ + if (param.filter_type.enumv() == param.src_type.enumv() && \ + param.src_type.enumv() == DTypeTrait<_i_src_type>::enumv && \ + param.dst_type.enumv() == DTypeTrait<_i_dst_type>::enumv) { \ + conv1x1_gemv_worker = \ + Conv1x1GemvWorker<_src_ctype, _bias_ctype, _dst_ctype, \ + DTypeTrait<_i_bias_type>::ctype, \ + DTypeTrait<_i_dst_type>::ctype, \ + _postprocess_mode, _format>::exec; \ + } \ + } \ + MIDOUT_END() + + switch (opr->param().format) { + case param::ConvBias::Format::NCHW: + cb1(param::ConvBias::Format::NCHW, dt_float32, dt_float32, + PostprocessMode::FLOAT, "NCHW::GEMV::FLOAT"_hash); +#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC + cb1(param::ConvBias::Format::NCHW, dt_float16, __fp16, + PostprocessMode::FLOAT, "NCHW::GEMV::FLOAT16_FP16"_hash); +#endif + cb2(param::ConvBias::Format::NCHW, dt_int8, dt_int32, dt_int32, + dt_int8, dt_int32, dt_int32, PostprocessMode::NO_PROCESS, + "NCHW::GEMV::INT8x8x32_INT32"_hash); + cb2(param::ConvBias::Format::NCHW, dt_int8, dt_int16, dt_int16, + dt_int8, dt_int16, dt_int16, PostprocessMode::NO_PROCESS, + "NCHW::GEMV::INT8x8x16_INT16"_hash); + cb2(param::ConvBias::Format::NCHW, dtype::QuantizedS8, + dtype::QuantizedS32, dtype::QuantizedS32, dt_int8, dt_int32, + dt_int32, PostprocessMode::NO_PROCESS, + "NCHW::GEMV::QINT8x8x32_QINT32"_hash); + cb2(param::ConvBias::Format::NCHW, dtype::QuantizedS8, + dtype::QuantizedS32, dtype::QuantizedS8, dt_int8, dt_int32, + dt_int8, PostprocessMode::QUANTIZED, + "NCHW::GEMV::QINT8x8x32_QINT8"_hash); + cb2(param::ConvBias::Format::NCHW, dtype::Quantized8Asymm, + dtype::QuantizedS32, dtype::QuantizedS32, dt_uint8, dt_int32, + dt_int32, PostprocessMode::NO_PROCESS, + "NCHW::GEMV::QUINT8x8x32_QINT32"_hash); + cb2(param::ConvBias::Format::NCHW, dtype::Quantized8Asymm, + dtype::QuantizedS32, dtype::Quantized8Asymm, dt_uint8, dt_int32, + dt_uint8, PostprocessMode::QUANTIZED, + "NCHW::GEMV::QUINT8x8x32_QUINT8"_hash); + break; + + default: + megdnn_throw("Invalid Format"); + break; + } +#undef cb1 +#undef cb2 + + megdnn_assert(conv1x1_gemv_worker, "No suitable gemv worker"); + + auto kern_compt = + [compt_oc_block_size, param, conv1x1_gemv_worker, whole_bundle, + thread_bundle]( + const ConvBiasImpl::NCBKernParam& ncb_param, + const ConvBiasImpl::NCBKernIndex& ncb_index) mutable { + conv1x1_gemv_worker(whole_bundle, thread_bundle, + compt_oc_block_size, param, ncb_param, + std::move(ncb_index)); + }; + ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}}); + return ret_kern; +} + +bool ConvBiasImpl::AlgoConv1x1Gemv::usable(ConvBiasImpl* opr, + const NCBKernSizeParam& param, + AlgoSelectionStrategy) const { + 
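Context for the checks that follow: usable() accepts only the case where the convolution is literally a matrix-vector product, i.e. a 1x1 filter with unit stride, no padding, and a 1x1 output. Per (batch, group), Conv1x1GemvWorker then runs a gemv with M = OC, N = 1 and K = IC. A minimal reference of that degenerate case, assuming float data and a row-major (OC, IC) filter; the helper is hypothetical and not part of this patch:

    #include <cstddef>
    // dst[oc] = sum over ic of filter[oc][ic] * src[ic] -- a plain gemv.
    inline void conv1x1_spatial_one_ref(const float* filter, const float* src,
                                        float* dst, size_t OC, size_t IC) {
        for (size_t oc = 0; oc < OC; ++oc) {
            float acc = 0.f;
            for (size_t ic = 0; ic < IC; ++ic)
                acc += filter[oc * IC + ic] * src[ic];
            dst[oc] = acc;
        }
    }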
MIDOUT_BEGIN(megdnn_fallback_conv1x1_gemv,
+                 midout_iv("AlgoConv1x1Gemv::usable"_hash)) {
+        //! whether 1x1
+        size_t FH = param.filter_meta.spatial[0],
+               FW = param.filter_meta.spatial[1];
+        size_t PH = param.filter_meta.padding[0],
+               PW = param.filter_meta.padding[1];
+        size_t SH = param.filter_meta.stride[0],
+               SW = param.filter_meta.stride[1];
+
+        if (FH != 1 || FW != 1 || PH || PW || SH != 1 || SW != 1) {
+            return false;
+        }
+
+        //! whether gemv
+        size_t OH = param.osz[0];
+        size_t OW = param.osz[1];
+        if (OH * OW != 1) {
+            return false;
+        }
+
+        //! int16 x int16 -> int32 has no gemv support, not even a naive
+        //! implementation
+        if ((param.src_type.enumv() == param.filter_type.enumv() &&
+             param.src_type.enumv() == DTypeEnum::Int16) &&
+            param.dst_type.enumv() == DTypeEnum::Int32) {
+            return false;
+        }
+
+        //! make sure that for 8x8x16 and 8x8x32 the bias mode is NO_BIAS and
+        //! the nonlineMode is IDENTITY, otherwise return false, because
+        //! 8x8x32 and 8x8x16 do not support PostProcess
+        if (param.dst_type.enumv() == DTypeEnum::Int16 ||
+            param.dst_type.enumv() == DTypeEnum::Int32 ||
+            param.dst_type.enumv() == DTypeEnum::QuantizedS32) {
+            if (param.bias_mode != megdnn::BiasMode::NO_BIAS ||
+                param.nonlineMode != megdnn::NonlineMode::IDENTITY) {
+                return false;
+            }
+        }
+
+        //! only a limited set of dtypes is supported
+        if (param.src_type.enumv() != param.filter_type.enumv()) {
+            return false;
+        }
+
+        if (param.src_type.enumv() != DTypeEnum::Int8 &&
+            param.src_type.enumv() != DTypeEnum::QuantizedS8 &&
+            param.src_type.enumv() != DTypeEnum::Quantized8Asymm &&
+#if !MEGDNN_DISABLE_FLOAT16
+            param.src_type.enumv() != DTypeEnum::Float16 &&
+#endif
+            param.src_type.enumv() != DTypeEnum::Float32) {
+            return false;
+        }
+
+        bool is_param_ok =
+                (param.filter_meta.dilation[0] ==
+                         param.filter_meta.dilation[1] &&
+                 param.filter_meta.dilation[0] == 1) &&
+                param.compute_mode == param::ConvBias::ComputeMode::DEFAULT;
+
+        bool is_format_and_dtype_ok = false;
+#if MEGDNN_X86
+        if (opr->param().format == param::ConvBias::Format::NCHW) {
+            //! x86 supports all dtypes in NCHW
+            is_format_and_dtype_ok = true;
+        }
+#elif MEGDNN_AARCH64 || MEGDNN_ARMV7
+        //! add NCHW44 and NCHW44_DOT support in the future
+        if (opr->param().format == param::ConvBias::Format::NCHW) {
+            //! the NCHW format supports all dtypes
+            is_format_and_dtype_ok = true;
+        }
+#endif
+        return is_param_ok && is_format_and_dtype_ok;
+    }
+    MIDOUT_END();
+    return false;
+}
+
+bool ConvBiasImpl::AlgoConv1x1Gemv::is_preferred(
+        ConvBiasImpl*, const NCBKernSizeParam& param) const {
+    size_t OC = param.filter_meta.ocpg;
+    if (OC <= 2 && param.src_type.enumv() != DTypeEnum::Float32)
+        return true;
+#if (MEGDNN_ARMV7 || MEGDNN_AARCH64)
+    //!
maybe add support for QuantizedAsym in the future + return (param.src_type.enumv() == DTypeEnum::Int8 && + param.filter_type.enumv() == DTypeEnum::Int8 && + param.dst_type.enumv() == DTypeEnum::Int32) || + (param.src_type.enumv() == DTypeEnum::QuantizedS8 && + param.filter_type.enumv() == DTypeEnum::QuantizedS8 && + param.dst_type.enumv() == DTypeEnum::QuantizedS8) || + (param.src_type.enumv() == DTypeEnum::QuantizedS8 && + param.filter_type.enumv() == DTypeEnum::QuantizedS8 && + param.dst_type.enumv() == DTypeEnum::QuantizedS32) || +#if !MEGDNN_DISABLE_FLOAT16 + (param.src_type.enumv() == DTypeEnum::Float16 && + param.filter_type.enumv() == DTypeEnum::Float16 && + param.dst_type.enumv() == DTypeEnum::Float16) || +#endif + (param.src_type.enumv() == DTypeEnum::Float32 && + param.filter_type.enumv() == DTypeEnum::Float32 && + param.dst_type.enumv() == DTypeEnum::Float32); +#else + return false; +#endif +} + +// vim: syntax=cpp.doxygen \ No newline at end of file diff --git a/dnn/src/fallback/conv_bias/conv1x1/algos_conv1x1_gemv.h b/dnn/src/fallback/conv_bias/conv1x1/algos_conv1x1_gemv.h new file mode 100644 index 00000000..38b00a20 --- /dev/null +++ b/dnn/src/fallback/conv_bias/conv1x1/algos_conv1x1_gemv.h @@ -0,0 +1,47 @@ +/** + * \file dnn/src/fallback/conv_bias/conv1x1/algos_conv1x1_gemv.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "megdnn/thin/small_vector.h" +#include "src/common/utils.h" +#include "src/fallback/conv_bias/opr_impl.h" + +namespace megdnn { +namespace fallback { + +class ConvBiasImpl::AlgoConv1x1Gemv final : public AlgoBase { +public: + AlgoConv1x1Gemv() = default; + + bool is_reproducible() const override { return true; } + + const char* name() const override { + return "CONV1x1_GEMV"; + } + + bool usable(ConvBiasImpl* opr, const NCBKernSizeParam& param, + AlgoSelectionStrategy algo_selection_strategy) const override; + size_t get_workspace(ConvBiasImpl*, + const NCBKernSizeParam& param) const override; + SmallVector dispatch_kerns( + ConvBiasImpl* opr, const NCBKernSizeParam& param) const override; + + bool is_preferred(ConvBiasImpl*, const NCBKernSizeParam&) const override; + +protected: + size_t get_oc_tile_size_heuristic(const NCBKernSizeParam& param) const; +}; + +} // namespace fallback +} // namespace megdnn + +// vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h index 581d2d69..e204f90d 100644 --- a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h +++ b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h @@ -11,30 +11,12 @@ #pragma once -#include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h" +#include "src/fallback/conv_bias/conv1x1/conv1x1_utils.h" namespace megdnn { namespace fallback { namespace conv1x1 { -namespace { -//! get_thread_bundle -WorkspaceBundle get_thread_bundle(const ConvBiasImpl::NCBKernSizeParam& param, - size_t matmul_c_size, size_t oc_tile_size) { - //! 
for some cases, matmul result need temp space to store - size_t OH = param.osz[0]; - size_t OW = param.osz[1]; - bool is_dst_8bit = (param.src_type.enumv() == DTypeEnum::QuantizedS8 && - param.dst_type.enumv() == DTypeEnum::QuantizedS8) || - (param.src_type.enumv() == DTypeEnum::Quantized8Asymm && - param.dst_type.enumv() == DTypeEnum::Quantized8Asymm); - size_t matmul_dst_bytes_per_thread = - is_dst_8bit ? oc_tile_size * OH * OW * sizeof(param.bias_type) : 0; - return WorkspaceBundle{nullptr, - {matmul_c_size, matmul_dst_bytes_per_thread}}; -} -} // anonymous namespace - template class Conv1x1Kerns { public: @@ -51,7 +33,7 @@ public: //! matmul_param records a matmul with M = oc_tile_size, K = IC, N = OH //! * OW this does not bother packb bytes auto matmul_bundle = matmul_algo->get_bundle(matmul_param); - auto thread_bundle = get_thread_bundle(param, matmul_bundle.get_size(2), + auto thread_bundle = utils::get_thread_bundle(param, matmul_bundle.get_size(2), oc_tile_size); //! size per thread @@ -86,7 +68,7 @@ public: const MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_tile_size) { size_t matmul_size = matmul_algo->get_workspace(matmul_param); - auto thread_bundle = get_thread_bundle(param, matmul_size, oc_tile_size); + auto thread_bundle = utils::get_thread_bundle(param, matmul_size, oc_tile_size); //! size per thread size_t all_threads_bytes = thread_bundle.total_size_in_bytes() * param.nr_threads; diff --git a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp index feeb4b72..c5f9151d 100644 --- a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp +++ b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp @@ -10,8 +10,8 @@ * implied. */ +#include "src/fallback/conv_bias/conv1x1/conv1x1_utils.h" #include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h" -#include #include "midout.h" @@ -20,53 +20,7 @@ MIDOUT_DECL(megdnn_fallback_conv1x1_factory_strategy) namespace megdnn { namespace fallback { namespace conv1x1 { - namespace { - -struct StrategyHashParam { - ConvBiasImpl::NCBKernSizeParam param; - param::ConvBias::Format format; - MatrixMulImpl::AlgoBase::PackMode packmode; -}; - -struct StrategyHashParamHash { - std::size_t operator()(const StrategyHashParam& sparam) const { - constexpr size_t base = 1; //! avoid hashkey is zero - std::size_t result = - static_cast(sparam.param.src_type.enumv()) + base; - result = result ^ - ((static_cast(sparam.param.dst_type.enumv()) + - base) - << 3); - result = result ^ - ((static_cast(sparam.param.filter_type.enumv()) + - base) - << 6); - result = result ^ - ((static_cast(sparam.param.bias_type.enumv()) + - base) - << 9); - result = result ^ - ((static_cast(sparam.format) + base) << 12); - result = result ^ - ((static_cast(sparam.packmode) + base) << 15); - return result; - }; -}; - -struct StrategyHashParamEqual { - bool operator()(const StrategyHashParam& param1, - const StrategyHashParam& param2) const { - bool flags = true; - flags = param1.param.src_type == param2.param.src_type && flags; - flags = param1.param.filter_type == param2.param.filter_type && flags; - flags = param1.param.bias_type == param2.param.bias_type && flags; - flags = param1.param.dst_type == param2.param.dst_type && flags; - flags = param1.format == param2.format && flags; - flags = param1.packmode == param2.packmode && flags; - return flags; - }; -}; //! NOTE: must keep consistence with can_make_conv1x1_strategy when you modify //! 
this function std::unique_ptr create_conv1x1_strategy( @@ -176,39 +130,14 @@ std::unique_ptr create_conv1x1_strategy( megdnn_throw("Invalid Data Type"); return nullptr; } - -class StrategyDelegationStorage { -public: - Conv1x1StrategyBase* get(const ConvBiasImpl::NCBKernSizeParam& param, - MatrixMulImpl::AlgoBase::PackMode pack_mode, - param::ConvBias::Format format) { - MEGDNN_LOCK_GUARD(m_mtx); - StrategyHashParam sparam; - sparam.param = param; - sparam.format = format; - sparam.packmode = pack_mode; - if (m_map_strategies.find(sparam) == m_map_strategies.end()) { - auto strategy = create_conv1x1_strategy(param, pack_mode, format); - m_map_strategies[sparam] = std::move(strategy); - } - return m_map_strategies[sparam].get(); - } - -private: - std::mutex m_mtx; - std::unordered_map, - StrategyHashParamHash, StrategyHashParamEqual> - m_map_strategies; -}; - } // anonymous namespace Conv1x1StrategyBase* Conv1x1Factory::make_conv1x1_strategy( const ConvBiasImpl::NCBKernSizeParam& param, MatrixMulImpl::AlgoBase::PackMode pack_mode, param::ConvBias::Format format) { - static StrategyDelegationStorage storage; - return storage.get(param, pack_mode, format); + static utils::StrategyDelegationStorage storage; + return storage.get(param, pack_mode, format, create_conv1x1_strategy); } bool Conv1x1Factory::can_make_conv1x1_strategy( @@ -277,3 +206,5 @@ bool Conv1x1Factory::can_make_conv1x1_strategy( } // namespace conv1x1 } // namespace fallback } // namespace megdnn + +// vim: syntax=cpp.doxygen \ No newline at end of file diff --git a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h index 649e7092..5fe94f4d 100644 --- a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h +++ b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h @@ -13,6 +13,8 @@ #pragma once #include "src/fallback/conv_bias/opr_impl.h" +#include "src/fallback/conv_bias/conv1x1/conv1x1_utils.h" + #if MEGDNN_X86 #include "src/x86/conv_bias/postprocess_helper.h" #elif (MEGDNN_ARMV7 || MEGDNN_AARCH64) @@ -27,44 +29,6 @@ namespace conv1x1 { using namespace x86; #endif -namespace { - -//! get_matmul_kern_param -MatrixMulImpl::KernSizeParam get_matmul_kern_param( - const ConvBiasImpl::NCBKernSizeParam& param, size_t n, size_t m) { - size_t M = m; - size_t N = n; - size_t K = param.filter_meta.icpg; //! K = IC - size_t LDA = K, LDB = N, LDC = N; - bool is_dst_8bit = (param.src_type.enumv() == DTypeEnum::QuantizedS8 && - param.dst_type.enumv() == DTypeEnum::QuantizedS8) || - (param.src_type.enumv() == DTypeEnum::Quantized8Asymm && - param.dst_type.enumv() == DTypeEnum::Quantized8Asymm); - size_t pack_c_size = pack_size(param.filter_meta.format); - auto format = param::MatrixMul::Format::DEFAULT; - if (param.filter_meta.format == param::ConvBias::Format::NCHW44) { - format = param::MatrixMul::Format::MK4; - } else if (param.filter_meta.format == - param::ConvBias::Format::NCHW44_DOT) { - format = param::MatrixMul::Format::MK4_DOT; - } - - return {param.filter_type, - param.src_type, - is_dst_8bit ? 
param.bias_type : param.dst_type, - M, - N, - K, - LDA * pack_c_size, - LDB * pack_c_size, - LDC * pack_c_size, - false, - false, - param::MatrixMul::ComputeMode::DEFAULT, - format}; -} -} // namespace - class Conv1x1StrategyBase { public: virtual void packA(WorkspaceBundle& whole_bundle, @@ -134,7 +98,7 @@ public: size_t IC = param.filter_meta.icpg; MatrixMulImpl::KernParam matmul_kern_param; static_cast(matmul_kern_param) = - get_matmul_kern_param(param, OH * OW, oc_end - oc_start); + utils::get_matmul_kern_param(param, OH * OW, oc_end - oc_start); size_t bytes_offset_of_a_panel = group_id * packa_bytes_per_group + @@ -176,8 +140,7 @@ public: MatrixMulImpl::KernParam matmul_kern_param; static_cast(matmul_kern_param) = - get_matmul_kern_param(param, OH * OW, OC); - + utils::get_matmul_kern_param(param, OH * OW, OC); rep(batch, BATCH) { rep(g, GROUP) { @@ -238,7 +201,7 @@ public: MatrixMulImpl::KernParam matmul_kern_param; static_cast(matmul_kern_param) = - get_matmul_kern_param(param, OH * OW, oc_end - oc_start); + utils::get_matmul_kern_param(param, OH * OW, oc_end - oc_start); size_t bytes_offset_of_a_panel = group_id * packa_bytes_per_group + @@ -328,7 +291,6 @@ public: MatrixMulImpl::AlgoBase::PackMode pack_mode, param::ConvBias::Format format); }; - } // namespace conv1x1 } // namespace fallback } // namespace megdnn diff --git a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.cpp b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.cpp new file mode 100644 index 00000000..c8e263aa --- /dev/null +++ b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.cpp @@ -0,0 +1,75 @@ +/** + * \file dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#include "src/fallback/conv_bias/conv1x1/conv1x1_utils.h" + +namespace megdnn { +namespace fallback { +namespace conv1x1 { +namespace utils{ + +//! get_thread_bundle +WorkspaceBundle get_thread_bundle(const ConvBiasImpl::NCBKernSizeParam& param, + size_t matmul_c_size, size_t oc_tile_size) { + //! for some cases, matmul result need temp space to store + size_t OH = param.osz[0]; + size_t OW = param.osz[1]; + bool is_dst_8bit = (param.src_type.enumv() == DTypeEnum::QuantizedS8 && + param.dst_type.enumv() == DTypeEnum::QuantizedS8) || + (param.src_type.enumv() == DTypeEnum::Quantized8Asymm && + param.dst_type.enumv() == DTypeEnum::Quantized8Asymm); + size_t matmul_dst_bytes_per_thread = + is_dst_8bit ? oc_tile_size * OH * OW * sizeof(param.bias_type) : 0; + return WorkspaceBundle{nullptr, + {matmul_c_size, matmul_dst_bytes_per_thread}}; +} + +//! get_matmul_kern_param +MatrixMulImpl::KernSizeParam get_matmul_kern_param( + const ConvBiasImpl::NCBKernSizeParam& param, size_t n, size_t m) { + size_t M = m; + size_t N = n; + size_t K = param.filter_meta.icpg; //! 
K = IC + size_t LDA = K, LDB = N, LDC = N; + bool is_dst_8bit = (param.src_type.enumv() == DTypeEnum::QuantizedS8 && + param.dst_type.enumv() == DTypeEnum::QuantizedS8) || + (param.src_type.enumv() == DTypeEnum::Quantized8Asymm && + param.dst_type.enumv() == DTypeEnum::Quantized8Asymm); + size_t pack_c_size = pack_size(param.filter_meta.format); + auto format = param::MatrixMul::Format::DEFAULT; + if (param.filter_meta.format == param::ConvBias::Format::NCHW44) { + format = param::MatrixMul::Format::MK4; + } else if (param.filter_meta.format == + param::ConvBias::Format::NCHW44_DOT) { + format = param::MatrixMul::Format::MK4_DOT; + } + + return {param.filter_type, + param.src_type, + is_dst_8bit ? param.bias_type : param.dst_type, + M, + N, + K, + LDA * pack_c_size, + LDB * pack_c_size, + LDC * pack_c_size, + false, + false, + param::MatrixMul::ComputeMode::DEFAULT, + format}; +} + +} // namespace utils +} // namespace conv1x1 +} // namespace fallback +} // namespace megdnn + +// vim: syntax=cpp.doxygen \ No newline at end of file diff --git a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.h b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.h new file mode 100644 index 00000000..e6b8bdbe --- /dev/null +++ b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.h @@ -0,0 +1,102 @@ +/** + * \file dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include +#include "src/fallback/conv_bias/opr_impl.h" + +namespace megdnn { +namespace fallback { +namespace conv1x1 { +namespace utils { + +struct StrategyHashKey { + ConvBiasImpl::NCBKernSizeParam param; + param::ConvBias::Format format; + MatrixMulImpl::AlgoBase::PackMode packmode; +}; + +struct StrategyHasher { + std::size_t operator()(const StrategyHashKey& key) const { + constexpr size_t base = 1; //! 
avoid hashkey is zero + std::size_t result = + static_cast(key.param.src_type.enumv()) + base; + result = result ^ + ((static_cast(key.param.dst_type.enumv()) + base) + << 3); + result = result ^ + ((static_cast(key.param.filter_type.enumv()) + + base) + << 6); + result = result ^ + ((static_cast(key.param.bias_type.enumv()) + base) + << 9); + result = result ^ ((static_cast(key.format) + base) << 12); + result = result ^ + ((static_cast(key.packmode) + base) << 15); + return result; + } +}; + +struct StrategyHashKeyEqual { + bool operator()(const StrategyHashKey& key1, + const StrategyHashKey& key2) const { + return key1.param.src_type == key2.param.src_type && + key1.param.filter_type == key2.param.filter_type && + key1.param.bias_type == key2.param.bias_type && + key1.param.dst_type == key2.param.dst_type && + key1.format == key2.format && key1.packmode == key2.packmode; + } +}; + +template +class StrategyDelegationStorage { + using creator = std::function( + const ConvBiasImpl::NCBKernSizeParam&, + MatrixMulImpl::AlgoBase::PackMode, param::ConvBias::Format)>; + +public: + T* get(const ConvBiasImpl::NCBKernSizeParam& param, + MatrixMulImpl::AlgoBase::PackMode pack_mode, + param::ConvBias::Format format, creator Fun) { + MEGDNN_LOCK_GUARD(m_mtx); + StrategyHashKey key; + key.param = param; + key.format = format; + key.packmode = pack_mode; + if (m_map_strategies.find(key) == m_map_strategies.end()) { + auto strategy = Fun(param, pack_mode, format); + m_map_strategies[key] = std::move(strategy); + } + return m_map_strategies[key].get(); + } + +private: + std::mutex m_mtx; + std::unordered_map, StrategyHasher, + StrategyHashKeyEqual> + m_map_strategies; +}; + +//! get_thread_bundle +WorkspaceBundle get_thread_bundle(const ConvBiasImpl::NCBKernSizeParam& param, + size_t matmul_c_size, size_t oc_tile_size); +//! 
get_matmul_kern_param +MatrixMulImpl::KernSizeParam get_matmul_kern_param( + const ConvBiasImpl::NCBKernSizeParam& param, size_t n, size_t m); + +} // namespace utils +} // namespace conv1x1 +} // namespace fallback +} // namespace megdnn + +// vim: syntax=cpp.doxygen \ No newline at end of file diff --git a/dnn/src/fallback/conv_bias/im2col/algos.cpp b/dnn/src/fallback/conv_bias/im2col/algos.cpp index 051affe1..d429a775 100644 --- a/dnn/src/fallback/conv_bias/im2col/algos.cpp +++ b/dnn/src/fallback/conv_bias/im2col/algos.cpp @@ -647,8 +647,11 @@ bool ConvBiasImpl::AlgoIm2col::usable( return false; } - if (param.src_type.enumv() != param.filter_type.enumv() && - param.src_type.enumv() != DTypeEnum::Int8 && + if(param.src_type.enumv() != param.filter_type.enumv()) { + return false; + } + + if (param.src_type.enumv() != DTypeEnum::Int8 && param.src_type.enumv() != DTypeEnum::QuantizedS8 && param.src_type.enumv() != DTypeEnum::Quantized8Asymm && #if !MEGDNN_DISABLE_FLOAT16 diff --git a/dnn/src/fallback/conv_bias/opr_impl.cpp b/dnn/src/fallback/conv_bias/opr_impl.cpp index f1377da0..2b9b5597 100644 --- a/dnn/src/fallback/conv_bias/opr_impl.cpp +++ b/dnn/src/fallback/conv_bias/opr_impl.cpp @@ -16,6 +16,7 @@ #include "src/common/utils.h" #include "src/fallback/conv_bias/algos.h" #include "src/fallback/conv_bias/conv1x1/algos.h" +#include "src/fallback/conv_bias/conv1x1/algos_conv1x1_gemv.h" #include "src/fallback/conv_bias/im2col/algos.h" #include "src/fallback/conv_bias/opr_impl.h" #include "src/naive/convolution/algorithms.h" @@ -53,6 +54,10 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj { public: AlgoPack() { + + refhold.emplace_back(new AlgoConv1x1Gemv()); + all_algos.emplace_back(refhold.back().get()); + static CpuOprDelegationStorage<> storage; auto matmul_opr = storage.get(); auto&& matmul_algos = diff --git a/dnn/src/fallback/conv_bias/opr_impl.h b/dnn/src/fallback/conv_bias/opr_impl.h index 96155360..a70fd474 100644 --- a/dnn/src/fallback/conv_bias/opr_impl.h +++ b/dnn/src/fallback/conv_bias/opr_impl.h @@ -259,6 +259,7 @@ private: class AlgoNaive; class AlgoIm2col; class AlgoConv1x1; + class AlgoConv1x1Gemv; class AlgoWinogradF32; class AlgoWinogradF32_4x4; class AlgoWinogradQS8; diff --git a/dnn/src/fallback/matrix_mul/algos.cpp b/dnn/src/fallback/matrix_mul/algos.cpp index 05f55416..364780cf 100644 --- a/dnn/src/fallback/matrix_mul/algos.cpp +++ b/dnn/src/fallback/matrix_mul/algos.cpp @@ -11,6 +11,7 @@ #include "src/fallback/matrix_mul/algos.h" #include "src/fallback/matrix_mul/gemm_impl.h" +#include "src/fallback/matrix_mul/gemv.h" #include "src/fallback/matrix_mul/generic_strategy.h" #include "midout.h" @@ -71,39 +72,6 @@ MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL(AlgoF32K8x12x1, megdnn_fb_matmul_f32_kern, float); /* ===================== gemv algo ===================== */ - -namespace { -template -void gemm_gemv_like(const MatrixMulImpl::KernParam& kern_param) { - const itype* A = kern_param.A(); - const itype* B = kern_param.B(); - uint8_t zp0, zp1; - if (have_zp) { - zp0 = kern_param.A_type.param().zero_point; - zp1 = kern_param.B_type.param().zero_point; - } - - otype* C = kern_param.C(); - for (size_t m = 0; m < kern_param.M; ++m) { - memset(C + m * kern_param.LDC, 0, sizeof(otype) * kern_param.N); - for (size_t k = 0; k < kern_param.K; ++k) - for (size_t n = 0; n < kern_param.N; ++n) { - if (!have_zp) - C[m * kern_param.LDC + n] += - static_cast(A[m * kern_param.LDA + k]) * - static_cast(B[k * kern_param.LDB + n]); - else { - C[m * kern_param.LDC + n] += - (static_cast(A[m * 
kern_param.LDA + k]) - - static_cast(zp0)) * - (static_cast(B[k * kern_param.LDB + n]) - - static_cast(zp1)); - } - } - } -} -} // anonymous namespace - bool MatrixMulImpl::AlgoGemv::usable( const KernSizeParam& kern_size_param) const { return !kern_size_param.trA && !kern_size_param.trB && diff --git a/dnn/src/fallback/matrix_mul/gemv.h b/dnn/src/fallback/matrix_mul/gemv.h new file mode 100644 index 00000000..5ea89270 --- /dev/null +++ b/dnn/src/fallback/matrix_mul/gemv.h @@ -0,0 +1,71 @@ +/** + * \file dnn/src/fallback/matrix_mul/gemv.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2020 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + */ + +#pragma once + +#include "src/fallback/matrix_mul/opr_impl.h" + +namespace megdnn { +namespace fallback{ + +template +void gemv_like(const itype* A, const itype* B, otype* C, size_t M, size_t N, + size_t K, size_t LDA, size_t LDB, size_t LDC) { + for (size_t m = 0; m < M; ++m) { + memset(C + m * LDC, 0, sizeof(otype) * N); + for (size_t k = 0; k < K; ++k) + for (size_t n = 0; n < N; ++n) { + C[m * LDC + n] += static_cast(A[m * LDA + k]) * + static_cast(B[k * LDB + n]); + } + } +} + +template +void gemv_like(const itype* A, const itype* B, otype* C, size_t M, size_t N, + size_t K, size_t LDA, size_t LDB, size_t LDC, uint8_t zp0, + uint8_t zp1) { + for (size_t m = 0; m < M; ++m) { + memset(C + m * LDC, 0, sizeof(otype) * N); + for (size_t k = 0; k < K; ++k) + for (size_t n = 0; n < N; ++n) { + C[m * LDC + n] += (static_cast(A[m * LDA + k]) - + static_cast(zp0)) * + (static_cast(B[k * LDB + n]) - + static_cast(zp1)); + } + } +} + +template +void gemm_gemv_like(const MatrixMulImpl::KernParam& kern_param) { + const itype* A = kern_param.A(); + const itype* B = kern_param.B(); + otype* C = kern_param.C(); + size_t M = kern_param.M; + size_t N = kern_param.N; + size_t K = kern_param.K; + size_t LDA = kern_param.LDA; + size_t LDB = kern_param.LDB; + size_t LDC = kern_param.LDC; + + if (have_zp) { + uint8_t zp0 = kern_param.A_type.param().zero_point; + uint8_t zp1 = kern_param.B_type.param().zero_point; + gemv_like(A, B, C, M, N, K, LDA, LDB, LDC, zp0, zp1); + } + else { + gemv_like(A, B, C, M, N, K, LDA, LDB, LDC); + } +} + +} // namespace fallback +} // namespace megdnn \ No newline at end of file diff --git a/dnn/test/arm_common/conv_bias_multi_thread.cpp b/dnn/test/arm_common/conv_bias_multi_thread.cpp index d9f7bd72..c96735f2 100644 --- a/dnn/test/arm_common/conv_bias_multi_thread.cpp +++ b/dnn/test/arm_common/conv_bias_multi_thread.cpp @@ -1861,6 +1861,12 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_1X1_S1_F32) { #elif MEGDNN_ARMV7 check_conv_bias(args, handle(), "CONV1x1:ARMV7_F32:48"); #endif + std::vector gemv_args; + for (auto&& arg : args) + if(arg.src.shape[2] == 1 && arg.src.shape[3] == 1) { + gemv_args.emplace_back(arg); + } + check_conv_bias(gemv_args, handle(), "CONV1x1_GEMV"); } TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_1X1_S1_MK4_PACK_F32) { @@ -1905,16 +1911,23 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_1X1_S1_F16) { dtype::Float16{}, dtype::Float16{}, dtype::Float16{}, "CONV1x1:AARCH32_F16_K4X16X1:24"); #endif + std::vector gemv_args; + for (auto&& arg : args) + if(arg.src.shape[2] == 1 && arg.src.shape[3] == 1) { + gemv_args.emplace_back(arg); + } + 
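The same filtering loop recurs in each of these tests: only cases whose spatial extent is 1x1 can be dispatched to CONV1x1_GEMV, since its usable() rejects everything else. A hypothetical helper capturing the repeated pattern, assuming the test suite's conv_bias::TestArg type:

    #include <vector>
    // Keep only the 1x1-spatial cases that CONV1x1_GEMV can handle.
    static std::vector<conv_bias::TestArg> filter_gemv_args(
            const std::vector<conv_bias::TestArg>& args) {
        std::vector<conv_bias::TestArg> gemv_args;
        for (auto&& arg : args)
            if (arg.src.shape[2] == 1 && arg.src.shape[3] == 1)
                gemv_args.emplace_back(arg);
        return gemv_args;
    }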
diff --git a/dnn/test/arm_common/conv_bias_multi_thread.cpp b/dnn/test/arm_common/conv_bias_multi_thread.cpp
index d9f7bd72..c96735f2 100644
--- a/dnn/test/arm_common/conv_bias_multi_thread.cpp
+++ b/dnn/test/arm_common/conv_bias_multi_thread.cpp
@@ -1861,6 +1861,12 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_1X1_S1_F32) {
 #elif MEGDNN_ARMV7
     check_conv_bias(args, handle(), "CONV1x1:ARMV7_F32:48");
 #endif
+    std::vector<conv_bias::TestArg> gemv_args;
+    for (auto&& arg : args)
+        if (arg.src.shape[2] == 1 && arg.src.shape[3] == 1) {
+            gemv_args.emplace_back(arg);
+        }
+    check_conv_bias(gemv_args, handle(), "CONV1x1_GEMV");
 }
 
 TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_1X1_S1_MK4_PACK_F32) {
@@ -1905,16 +1911,23 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_1X1_S1_F16) {
                       dtype::Float16{}, dtype::Float16{}, dtype::Float16{},
                       "CONV1x1:AARCH32_F16_K4X16X1:24");
 #endif
+    std::vector<conv_bias::TestArg> gemv_args;
+    for (auto&& arg : args)
+        if (arg.src.shape[2] == 1 && arg.src.shape[3] == 1) {
+            gemv_args.emplace_back(arg);
+        }
+    check_conv_bias(gemv_args, handle(), "CONV1x1_GEMV");
 }
 #endif
 
 TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_1X1_S1_QUANTIZEDSYM) {
     UniformIntRNG rng{-50, 50};
     float epsilon = 0.001;
-#define cb(name)                                                              \
-    checker_conv_bias(get_conv_bias_1x1_args(false, false, true, true),       \
-                      handle(), &rng, epsilon, dtype::QuantizedS8(2.5f),      \
-                      dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f),   \
+    std::vector<conv_bias::TestArg> args =
+            get_conv_bias_1x1_args(false, false, true, true);
+#define cb(name)                                                               \
+    checker_conv_bias(args, handle(), &rng, epsilon, dtype::QuantizedS8(2.5f), \
+                      dtype::QuantizedS8(2.5f), dtype::QuantizedS32(6.25f),    \
                       dtype::QuantizedS8(60.25f), name);
 #if MEGDNN_AARCH64
 #if __ARM_FEATURE_DOTPROD
@@ -1928,17 +1941,27 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_1X1_S1_QUANTIZEDSYM) {
     cb("CONV1x1:ARMV7_INT8X8X32_K4X8X8:48");
 #endif
 #undef cb
+    std::vector<conv_bias::TestArg> gemv_args;
+    for (auto&& arg : args)
+        if (arg.src.shape[2] == 1 && arg.src.shape[3] == 1) {
+            gemv_args.emplace_back(arg);
+        }
+    checker_conv_bias(gemv_args, handle(), &rng, epsilon,
+                      dtype::QuantizedS8(2.5f), dtype::QuantizedS8(2.5f),
+                      dtype::QuantizedS32(6.25f), dtype::QuantizedS8(60.25f),
+                      "CONV1x1_GEMV");
 }
 
 #if MEGDNN_AARCH64 || MEGDNN_ARMV7
 TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_1X1_S1_QUANTIZEDASYM) {
-    NormalRNG rng(128.f);
-#define cb(name)                                                     \
-    checker_conv_bias(get_conv_bias_1x1_args(false, false, true, true), \
-                      handle(), &rng, epsilon,                        \
-                      dtype::Quantized8Asymm(1.2f, (uint8_t)125),     \
-                      dtype::Quantized8Asymm(1.3f, (uint8_t)129),     \
-                      dtype::QuantizedS32(1.2 * 1.3),                 \
+    UniformIntRNG rng{-50, 50};
+    std::vector<conv_bias::TestArg> args =
+            get_conv_bias_1x1_args(false, false, true, true);
+#define cb(name)                                                  \
+    checker_conv_bias(args, handle(), &rng, epsilon,              \
+                      dtype::Quantized8Asymm(1.2f, (uint8_t)125), \
+                      dtype::Quantized8Asymm(1.3f, (uint8_t)129), \
+                      dtype::QuantizedS32(1.2 * 1.3),             \
                       dtype::Quantized8Asymm(50.3f, (uint8_t)120), name);
     float epsilon = 0.001;
 #if MEGDNN_AARCH64
@@ -1952,17 +1975,29 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_1X1_S1_QUANTIZEDASYM) {
     cb("CONV1x1:ARMV7_QUINT8_K4X8X8:48");
 #endif
 #undef cb
+    std::vector<conv_bias::TestArg> gemv_args;
+    for (auto&& arg : args)
+        if (arg.src.shape[2] == 1 && arg.src.shape[3] == 1) {
+            gemv_args.emplace_back(arg);
+        }
+    checker_conv_bias(gemv_args, handle(), &rng, epsilon,
+                      dtype::Quantized8Asymm(1.2f, (uint8_t)125),
+                      dtype::Quantized8Asymm(1.3f, (uint8_t)129),
+                      dtype::QuantizedS32(1.2 * 1.3),
+                      dtype::Quantized8Asymm(50.3f, (uint8_t)120),
+                      "CONV1x1_GEMV");
 }
 #endif
 
 #if MEGDNN_AARCH64 || MEGDNN_ARMV7
 TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_1X1_S1_QUINT8x8x32) {
-    UniformIntRNG rng{-50, 50};
+    NormalRNG rng(128.f);
     float epsilon = 0.001;
-#define cb(name)                                                              \
-    checker_conv_bias(get_conv_bias_1x1_args(true, true), handle(), &rng,     \
-                      epsilon, dtype::Quantized8Asymm(1.2f, (uint8_t)125),    \
-                      dtype::Quantized8Asymm(1.3f, (uint8_t)129),             \
+    std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(true, true);
+#define cb(name)                                                  \
+    checker_conv_bias(args, handle(), &rng, epsilon,              \
+                      dtype::Quantized8Asymm(1.2f, (uint8_t)125), \
+                      dtype::Quantized8Asymm(1.3f, (uint8_t)129), \
                       dtype::QuantizedS32(1.2 * 1.3), {}, name);
 
 #if MEGDNN_AARCH64
@@ -1978,15 +2013,25 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_1X1_S1_QUINT8x8x32) {
     cb("CONV1x1:ARMV7_QUINT8_K4X8X8:24");
 #endif
 #undef cb
+
+    std::vector<conv_bias::TestArg> gemv_args;
+    for (auto&& arg : args)
+        if (arg.src.shape[2] == 1 && arg.src.shape[3] == 1) {
+            gemv_args.emplace_back(arg);
+        }
+    checker_conv_bias(gemv_args, handle(), &rng, epsilon,
+                      dtype::Quantized8Asymm(1.2f, (uint8_t)125),
+                      dtype::Quantized8Asymm(1.3f, (uint8_t)129),
+                      dtype::QuantizedS32(1.2 * 1.3), {}, "CONV1x1_GEMV");
 }
 
 TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_1X1_S1_INT8x8x16) {
     UniformIntRNG rng{-50, 50};
     float epsilon = 0.001;
-#define cb(name)                                                              \
-    checker_conv_bias(get_conv_bias_1x1_args(true, true), handle(), &rng,     \
-                      epsilon, dtype::Int8{}, dtype::Int8{}, dtype::Int16{},  \
-                      dtype::Int16{}, name);
+    std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(true, true);
+#define cb(name)                                                    \
+    checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{}, \
+                      dtype::Int8{}, dtype::Int16{}, dtype::Int16{}, name);
 
 #if MEGDNN_AARCH64
     cb("CONV1x1:AARCH64_INT8X8X16_K8X8X8:24");
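The `gemv_args` filters above keep only cases with `src.shape[2] == 1 && src.shape[3] == 1`, i.e. OH == OW == 1 for a stride-1 1x1 convolution, because that is the geometry where the matmul's N dimension collapses to `OH * OW = 1` and the conv degenerates to a matrix-vector product with M = OC and K = IC. A sketch of that mapping for one batch, through the new fallback helper (the wrapper function is illustrative, not part of the patch, and assumes the megdnn source tree on the include path):

#include <cstddef>
#include "src/fallback/matrix_mul/gemv.h"

void conv1x1_as_gemv(const float* filter,  // OC x IC, row-major
                     const float* src,     // IC x 1 (one input pixel)
                     float* dst,           // OC x 1 (one output pixel)
                     size_t OC, size_t IC) {
    // M = OC, N = OH * OW = 1, K = IC; LDA = IC, LDB = LDC = 1.
    megdnn::fallback::gemv_like<float, float>(filter, src, dst, OC, 1, IC,
                                              IC, 1, 1);
}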
@@ -1997,6 +2042,16 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONVBIAS_1X1_S1_INT8x8x16) {
 #endif
     cb("CONV1x1:ARM_COMMON_INT8X8X16:48");
 #undef cb
+
+    std::vector<conv_bias::TestArg> gemv_args;
+    for (auto&& arg : args)
+        if (arg.src.shape[2] == 1 && arg.src.shape[3] == 1) {
+            gemv_args.emplace_back(arg);
+        }
+
+    checker_conv_bias(gemv_args, handle(), &rng, epsilon, dtype::Int8{},
+                      dtype::Int8{}, dtype::Int16{}, dtype::Int16{},
+                      "CONV1x1_GEMV");
 }
 #endif
 
@@ -2024,6 +2079,13 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_1X1_S1_INT8x8x32) {
     cb("CONV1x1:ARMV7_INT8X8X32_K4X2X16:48");
 #endif
 #undef cb
+
+    std::vector<conv_bias::TestArg> gemv_args;
+    for (auto&& arg : args)
+        if (arg.src.shape[2] == 1 && arg.src.shape[3] == 1) {
+            gemv_args.emplace_back(arg);
+        }
+    checker_conv_bias_mul_int8x8x32(gemv_args, handle(), "CONV1x1_GEMV");
 }
 
 #ifndef __ARM_FEATURE_DOTPROD
diff --git a/dnn/test/arm_common/matrix_mul.cpp b/dnn/test/arm_common/matrix_mul.cpp
index ad190fc8..a2415898 100644
--- a/dnn/test/arm_common/matrix_mul.cpp
+++ b/dnn/test/arm_common/matrix_mul.cpp
@@ -254,6 +254,50 @@ TEST_F(ARM_COMMON, BENCHMARK_SGEMV) {
         for (size_t N : {512, 1024})
             run(M, K, N);
 }
+
+TEST_F(ARM_COMMON, BENCHMARK_SGEMV_FP32) {
+    int exec_times = 50;
+    Benchmarker<MatrixMul> benchmarker(handle());
+    benchmarker.set_times(exec_times);
+    benchmarker.set_before_exec_callback(
+            AlgoChecker<MatrixMul>("ARM_COMMON_F32_GEMV"));
+
+    auto run = [&](size_t M, size_t K, size_t N) {
+        std::cout << "SGEMV: (" << M << ", " << K << ", " << N << ")"
+                  << std::endl;
+        benchmarker.set_dtype(0, dtype::Float32())
+                .set_dtype(1, dtype::Float32())
+                .set_dtype(2, dtype::Float32());
+        auto time = benchmarker.exec({{M, K}, {K, N}, {}}) / exec_times;
+        auto computations = 2 * M * K * N * 1e-6;
+        auto perf = computations / time;
+        std::cout << "gemv fp32, Performance is " << perf << " Gflops"
+                  << std::endl;
+    };
+
+    std::cout << "warm up:\n";
+    for (int i = 0; i < 50; i++) {
+        benchmarker.set_dtype(0, dtype::Float32())
+                .set_dtype(1, dtype::Float32())
+                .set_dtype(2, dtype::Float32())
+                .set_display(false)
+                .exec({{2, 1024}, {1024, 512}, {}});
+        benchmarker.set_display(true);
+    }
+
+    // run gemv
+    run(12, 48, 1);
+    run(48, 12, 1);
+    run(32, 128, 1);
+    run(128, 32, 1);
+    run(64, 256, 1);
+    run(256, 64, 1);
+    run(128, 512, 1);
+    run(512, 128, 1);
+    run(256, 1024, 1);
+    run(1024, 256, 1);
+}
+
 TEST_F(ARM_COMMON, BENCHMARK_SGEMV_FP16) {
     int exec_times = 50;
     Benchmarker<MatrixMul> benchmarker(handle());
@@ -290,6 +334,7 @@ TEST_F(ARM_COMMON, BENCHMARK_SGEMV_FP16) {
         for (size_t N : {512, 1024})
             run(M, K, N);
 }
+
 TEST_F(ARM_COMMON, BENCHMARK_SGEMM) {
     int exec_times = 10;
     Benchmarker<MatrixMul> benchmarker(handle());
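The Gflops arithmetic in the benchmark above is consistent: `exec()` reports milliseconds, and `2 * M * K * N * 1e-6` counts mega-ops, so their quotient comes out directly in giga-ops per second. A worked check with assumed numbers (standalone, not part of the patch):

#include <cstdio>

int main() {
    double M = 1024, K = 256, N = 1;
    double time_ms = 0.01;                       // assumed measured time
    double computations = 2 * M * K * N * 1e-6;  // ~0.524 mega-ops
    printf("%.2f Gflops\n", computations / time_ms);  // prints ~52.43
}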
diff --git a/dnn/test/common/conv_bias.cpp b/dnn/test/common/conv_bias.cpp
index 1a8d6a12..4f157dac 100644
--- a/dnn/test/common/conv_bias.cpp
+++ b/dnn/test/common/conv_bias.cpp
@@ -1081,7 +1081,7 @@ std::vector<TestArg> get_conv_bias_1x1_args(
     for (size_t n : {1, 2})
         for (size_t oc : {1, 9, 33})
             for (size_t ic : {1, 16, 64})
-                for (size_t size : {7, 14, 28})
+                for (size_t size : {1, 7, 14, 28})
                     for (auto nlmode : nonlinemode)
                         for (auto convmode : convmodes) {
                             pack(n, oc, ic, size, size, 1, nlmode, convmode);
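With `size = 1` added above, `pack(n, oc, ic, 1, 1, 1, nlmode, convmode)` now also emits degenerate spatial cases, for example src `{2, 16, 1, 1}` with filter `{9, 16, 1, 1}` in NCHW (the concrete numbers are illustrative). These are exactly the OH == OW == 1 args that the `gemv_args` filters in the tests above select, so every dtype variant of the shared 1x1 arg list now exercises the new CONV1x1_GEMV path as well.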