|
- /**
- * \file dnn/src/fallback/conv_bias/im2col/algos.cpp
- * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
- *
- * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- */
-
- #include "src/fallback/conv_bias/im2col/algos.h"
- #include "megdnn/opr_param_defs.h"
- #include "src/common/opr_delegate.h"
- #include "src/fallback/conv_bias/common.h"
- #include "src/fallback/conv_bias/opr_impl.h"
- #include "src/fallback/conv_bias/winograd/strategy.h"
- #include "src/fallback/convolution/img2col_helper.h"
- #include "src/naive/convolution/helper.h"
- #if MEGDNN_X86
- #include "src/x86/conv_bias/postprocess_helper.h"
- #endif
- #include "midout.h"
- MIDOUT_DECL(megdnn_fallback_im2col)
-
- using namespace megdnn;
- using namespace fallback;
-
- #if MEGDNN_X86
- using namespace x86;
- #endif
-
- /*======================== AlgoIm2col=======================*/
- /*!
-  * \brief Slot indices of the im2col workspace bundles.
-  *
-  * The BUNDLE_* constants address the operator-level bundle returned by
-  * AlgoIm2col::get_bundle(); the THREAD_BUNDLE_* constants address the
-  * per-thread bundle returned by AlgoIm2col::get_thread_bundle().
-  */
- struct Im2colBundelIndex {
-     //! operator-level bundle: {padding, packA, per-thread area}
-     static constexpr size_t BUNDLE_PADDING_INDEX = 0;
-     static constexpr size_t BUNDLE_PACKA_INDEX = 1;
-     static constexpr size_t BUNDLE_THREAD_INDEX = 2;
-
-     //! per-thread bundle: {packB, im2col, matmul dst, bias copy, matmul compute}
-     static constexpr size_t THREAD_BUNDLE_PACKB_INDEX = 0;
-     static constexpr size_t THREAD_BUNDLE_IM2COL_INDEX = 1;
-     static constexpr size_t THREAD_BUNDLE_MATMUL_DST_INDEX = 2;
-     static constexpr size_t THREAD_BUNDLE_BIAS_INDEX = 3;
-     static constexpr size_t THREAD_BUNDLE_COMPUTE_INDEX = 4;
- };
-
- /*!
-  * \brief Static helpers that resolve the pointers the im2col kernels work on.
-  */
- class PtrGetter {
- public:
-     /*!
-      * \brief Select the buffer the matmul result is written to.
-      *
-      * Slot \p bundle_id of \p bundle_thread is returned when the destination
-      * is 8-bit or when \p ohw_bigger_ohwblock is false. Otherwise the result
-      * goes straight into the dst tensor of (\p batch_id, \p group_id),
-      * advanced by \p oc_cur_index rows of \p OHW elements.
-      */
-     template <typename dtype>
-     static inline dtype* get_matmul_dst_ptr(
-             const ConvBiasImpl::NCBKernParam& param,
-             const WorkspaceBundle& bundle_thread, size_t bundle_id,
-             size_t oc_cur_index, size_t OHW, bool is_dst_8bit,
-             bool ohw_bigger_ohwblock, size_t batch_id, size_t group_id) {
-         const bool write_to_dst = !is_dst_8bit && ohw_bigger_ohwblock;
-         if (write_to_dst) {
-             dtype* const out =
-                     param.dst<dtype>(batch_id, group_id) + oc_cur_index * OHW;
-             return out;
-         }
-         return static_cast<dtype*>(bundle_thread.get(bundle_id));
-     }
-
-     /*!
-      * \brief Per-thread bias staging buffer.
-      * \return the THREAD_BUNDLE_BIAS_INDEX slot when bias_mode is BIAS,
-      *         nullptr for every other bias mode
-      */
-     template <typename bias_ctype>
-     static inline bias_ctype* get_bias_temp_ptr(
-             const ConvBiasImpl::NCBKernParam& param,
-             const WorkspaceBundle& bundle_thread) {
-         if (param.bias_mode != megdnn::BiasMode::BIAS) {
-             return nullptr;
-         }
-         return static_cast<bias_ctype*>(bundle_thread.get(
-                 Im2colBundelIndex::THREAD_BUNDLE_BIAS_INDEX));
-     }
-
-     /*!
-      * \brief Address of slot \p bundle_id advanced by \p offset bytes.
-      *
-      * The offset is applied on the integer value of the slot address, so it
-      * is always counted in bytes regardless of \p dtype.
-      */
-     template <typename dtype>
-     static inline dtype* get_bundle_offset_byte_ptr(
-             const WorkspaceBundle& bundle, size_t bundle_id, size_t offset) {
-         const uintptr_t slot_addr =
-                 reinterpret_cast<uintptr_t>(bundle.get(bundle_id));
-         return reinterpret_cast<dtype*>(slot_addr + offset);
-     }
- };
-
- using Pack_Mode=fallback::MatrixMulImpl::AlgoBase::PackMode; //!< matmul packing mode; DEFAULT, ONLY_PACKA and NO_PACK are the values used in this file
-
- /*!
-  * \brief Copy one input channel into the padding workspace, framed by PH
-  * rows and PW columns filled with the source zero point.
-  *
-  * ndrange_id is read as {group, batch, channel}. The zero point is 0 unless
-  * the source dtype is Quantized8Asymm, in which case its zero_point is used.
-  *
-  * \param bundle operator-level bundle; rebound to param.workspace_ptr here
-  * \param param kernel parameters of the convolution
-  * \param ncb_index index of the (group, batch, channel) plane to process
-  */
- template <typename src_ctype>
- static void copy_padding_kern(WorkspaceBundle bundle,
-                               const ConvBiasImpl::NCBKernParam& param,
-                               ConvBiasImpl::NCBKernIndex ncb_index) {
-     UNPACK_CONV_F32_NCB_KERN_SIZES(param);
-     MEGDNN_MARK_USED_VAR(N);
-     MEGDNN_MARK_USED_VAR(OC);
-     MEGDNN_MARK_USED_VAR(OH);
-     MEGDNN_MARK_USED_VAR(OW);
-     MEGDNN_MARK_USED_VAR(FH);
-     MEGDNN_MARK_USED_VAR(FW);
-     MEGDNN_MARK_USED_VAR(SH);
-     MEGDNN_MARK_USED_VAR(SW);
-
-     const size_t padded_w = IW + 2 * PW;
-     const size_t padded_h = IH + 2 * PH;
-     const size_t group_id = ncb_index.ndrange_id[0];
-     const size_t batch_id = ncb_index.ndrange_id[1];
-     const size_t channel_id = ncb_index.ndrange_id[2];
-
-     //! element offset of this plane inside the padding slot
-     const size_t group_elems = padded_h * padded_w * IC;
-     const size_t plane_offset =
-             group_id * group_elems +
-             param.filter_meta.group * batch_id * group_elems +
-             padded_h * padded_w * channel_id;
-     bundle.set(param.workspace_ptr);
-
-     src_ctype zero_point = static_cast<src_ctype>(0);
-     if (param.src_type.enumv() == DTypeEnum::Quantized8Asymm) {
-         zero_point = param.src_type.param<dtype::Quantized8Asymm>().zero_point;
-     }
-
-     const src_ctype* in_row =
-             param.src<src_ctype>(batch_id, group_id, channel_id);
-     src_ctype* out = static_cast<src_ctype*>(bundle.get(
-                              Im2colBundelIndex::BUNDLE_PADDING_INDEX)) +
-                      plane_offset;
-
-     //! top border rows
-     if (PH != 0) {
-         std::memset(out, zero_point, sizeof(src_ctype) * PH * padded_w);
-         out += PH * padded_w;
-     }
-     //! payload rows: left border, source row, right border
-     for (size_t row = 0; row < IH; ++row) {
-         for (size_t col = 0; col < PW; ++col) {
-             *(out++) = zero_point;
-         }
-         std::memcpy(out, in_row, sizeof(src_ctype) * IW);
-         out += IW;
-         in_row += IW;
-         for (size_t col = 0; col < PW; ++col) {
-             *(out++) = zero_point;
-         }
-     }
-     //! bottom border rows
-     if (PH != 0) {
-         std::memset(out, zero_point, sizeof(src_ctype) * PH * padded_w);
-     }
- }
-
- /*!
- * *\brief Im2colKerns collects all the im2col kernels. The macros defined
- * *below are the steps shared by its compute kernels.
- */
- /*! \brief COPY_BIAS: step 1 of the compute kernels.
- * Declares bias_ptr (bias of the current batch/group) and bias_temp_ptr (the
- * per-thread bias slot, nullptr unless bias_mode is BIAS). In BIAS mode it
- * copies, for every output channel in [oc_cur_index, oc_end_index), the
- * output_block_size bias values that start at ohw_cur_index into
- * bias_temp_ptr, so the staged tile has a row stride of output_block_size.
- * Expects param, bundle_thread, batch_id, group_id, OH, OW and the index
- * variables named above to be in scope where the macro is expanded.
- */
- #define COPY_BIAS() \
- const bias_ctype* bias_ptr = static_cast<const bias_ctype*>( \
- param.bias<bias_ctype>(batch_id, group_id)); \
- bias_ctype* bias_temp_ptr = \
- PtrGetter::get_bias_temp_ptr<bias_ctype>(param, bundle_thread); \
- if (param.bias_mode == megdnn::BiasMode::BIAS) { \
- bias_ctype* copy_dst = bias_temp_ptr; \
- const bias_ctype* copy_src = \
- bias_ptr + oc_cur_index * OH * OW + ohw_cur_index; \
- for (size_t oc = oc_cur_index; oc < oc_end_index; oc++) { \
- std::memcpy(copy_dst, copy_src, \
- sizeof(bias_ctype) * output_block_size); \
- copy_dst += output_block_size; \
- copy_src += OH * OW; \
- } \
- }
- /*! \brief IM2COL: step 2 of the compute kernels.
- * Declares im2col_dst (left nullptr when special_1x1) and no_padding_src
- * (source of the current batch/group advanced by ohw_cur_index).
- * Unless special_1x1, src2 is the padded copy in the BUNDLE_PADDING_INDEX slot
- * at byte offset (ndrange_id[0] + group count * ndrange_id[1]) *
- * padding_group_size, or the original source when PH == 0 && PW == 0. It is
- * expanded into the per-thread im2col slot by img2col (SH == 1 && SW == 1) or
- * img2col_stride; is_xcorr selects the template variant, and ohw_cur_index
- * and output_block_size are passed as the trailing arguments.
- */
- #define IM2COL() \
- src_ctype* im2col_dst = nullptr; \
- src_ctype* no_padding_src = \
- const_cast<src_ctype*>(param.src<src_ctype>(batch_id, group_id)) + \
- ohw_cur_index; \
- if (!special_1x1) { \
- size_t padding_group_size = IH2 * IW2 * IC * sizeof(src_ctype); \
- src_ctype* src2 = PtrGetter::get_bundle_offset_byte_ptr<src_ctype>( \
- bundle, Im2colBundelIndex::BUNDLE_PADDING_INDEX, \
- (ncb_index.ndrange_id[0] + \
- param.filter_meta.group * ncb_index.ndrange_id[1]) * \
- padding_group_size); \
- if (PH == 0 && PW == 0) { \
- src2 = const_cast<src_ctype*>( \
- param.src<src_ctype>(batch_id, group_id)); \
- } \
- im2col_dst = static_cast<src_ctype*>(bundle_thread.get( \
- Im2colBundelIndex::THREAD_BUNDLE_IM2COL_INDEX)); \
- if (SH == 1 && SW == 1) { \
- if (is_xcorr) { \
- img2col<true>(src2, im2col_dst, OC, OH, OW, IC, IH2, IW2, FH, \
- FW, ohw_cur_index, output_block_size); \
- } else { \
- img2col<false>(src2, im2col_dst, OC, OH, OW, IC, IH2, IW2, FH, \
- FW, ohw_cur_index, output_block_size); \
- } \
- } else { \
- if (is_xcorr) { \
- img2col_stride<true>(src2, im2col_dst, OC, OH, OW, IC, IH2, \
- IW2, FH, FW, SH, SW, ohw_cur_index, \
- output_block_size); \
- } else { \
- img2col_stride<false>(src2, im2col_dst, OC, OH, OW, IC, IH2, \
- IW2, FH, FW, SH, SW, ohw_cur_index, \
- output_block_size); \
- } \
- } \
- }
- /*! \brief POSTPROCESS_AND_COPYDST: step 4 of the compute kernels.
- * Calls PostProcess<...>::run with matmul_dst as both its first and third
- * pointer argument; the bias argument is bias_temp_ptr in BIAS mode and
- * bias_ptr + oc_cur_index otherwise. Unless skip_copy_dst is set, the tile is
- * then copied row by row (output_block_oc_size rows of output_block_size
- * dst_ctype values) from matmul_dst into the dst tensor at channel
- * oc_cur_index and position ohw_cur_index, with a destination row stride of
- * OHW.
- */
- #define POSTPROCESS_AND_COPYDST() \
- PostProcess<op_ctype, op_dtype, postprocess_mode>::run( \
- matmul_dst, \
- param.bias_mode == megdnn::BiasMode::BIAS \
- ? bias_temp_ptr \
- : const_cast<bias_ctype*>(bias_ptr + oc_cur_index), \
- matmul_dst, param.bias_mode, param.nonlineMode, param.bias_type, \
- param.dst_type, 1_z, output_block_oc_size, 1_z, \
- output_block_size); \
- if (!skip_copy_dst) { \
- dst_ctype* dst_tmp_ptr = reinterpret_cast<dst_ctype*>(matmul_dst); \
- dst_ctype* dst = param.dst<dst_ctype>(batch_id, group_id) + \
- oc_cur_index * OHW + ohw_cur_index; \
- for (size_t oc = 0; oc < output_block_oc_size; oc++) { \
- std::memcpy(dst, dst_tmp_ptr, \
- sizeof(dst_ctype) * output_block_size); \
- dst_tmp_ptr += output_block_size; \
- dst += OHW; \
- } \
- }
- /*! \brief PREPAR_MATMUL_DATA for Pack_Mode::DEFAULT: resolves the matmul
- * operand pointers. a_panel points into the BUNDLE_PACKA_INDEX slot at byte
- * offset ndrange_id[0] * packA_group_size + ndrange_id[3] *
- * packA_per_oc_block_size, b_panel is the start of the per-thread packB slot,
- * and matmul_dst comes from PtrGetter::get_matmul_dst_ptr with the im2col
- * slot as the workspace candidate.
- */
- #define PREPAR_MATMUL_DATA() \
- size_t packA_per_oc_block_size = \
- round_up(matmul_param.K, matmul_algo->get_inner_block_size().k) * \
- oc_tile_size * matmul_algo->get_packA_type_size(); \
- size_t packA_group_size = \
- matmul_algo->get_bundle(matmul_param).get_size(0); \
- src_ctype* a_panel = PtrGetter::get_bundle_offset_byte_ptr<src_ctype>( \
- bundle, Im2colBundelIndex::BUNDLE_PACKA_INDEX, \
- ncb_index.ndrange_id[0] * packA_group_size + \
- ncb_index.ndrange_id[3] * packA_per_oc_block_size); \
- src_ctype* b_panel = PtrGetter::get_bundle_offset_byte_ptr<src_ctype>( \
- bundle_thread, Im2colBundelIndex::THREAD_BUNDLE_PACKB_INDEX, 0); \
- /*In pack mode, the matmul dst and im2col dst share the same workspace*/ \
- bias_ctype* matmul_dst = PtrGetter::get_matmul_dst_ptr<bias_ctype>( \
- param, bundle_thread, \
- Im2colBundelIndex::THREAD_BUNDLE_IM2COL_INDEX, oc_cur_index, OHW, \
- is_dst_8bit, is_ohw_size_bigger, batch_id, group_id);
- /*! \brief MATMUL_COMPUTE for Pack_Mode::DEFAULT: fetches the naked kernel,
- * fills matmul_param for the current tile (M = output_block_oc_size,
- * N = output_block_size), packs B into b_panel and runs the kernel on the two
- * packed panels. B is im2col_dst when it is non-null, otherwise
- * no_padding_src; LDB is OH * OW when special_1x1 and output_block_size
- * otherwise.
- */
- #define MATMUL_COMPUTE() \
- auto matmul_kern_naked = matmul_algo->get_kern_naked(matmul_param); \
- matmul_param.M = output_block_oc_size; \
- matmul_param.N = output_block_size; \
- matmul_param.LDB = special_1x1 ? OH * OW : output_block_size; \
- matmul_param.LDC = output_block_size; \
- matmul_param.A_ptr = a_panel; \
- matmul_param.B_ptr = im2col_dst ? im2col_dst : no_padding_src; \
- matmul_param.C_ptr = matmul_dst; \
- matmul_algo->pack_B(matmul_param, b_panel, 0, output_block_size); \
- matmul_kern_naked(matmul_param, a_panel, b_panel);
- /*! \brief Primary template of the kernel collection; it is only declared
- * here and specialized below for DEFAULT, ONLY_PACKA and NO_PACK.
- */
- template <Pack_Mode packmode>
- class Im2colKerns;
- /*! \brief Kernels for matmul algorithms in Pack_Mode::DEFAULT: packA_kern
- * packs the filter (A) into the shared packA slot, and kerns packs the im2col
- * matrix (B) per tile before calling the naked matmul kernel.
- */
- template <>
- class Im2colKerns<Pack_Mode::DEFAULT> {
- public:
- /*! packA kern: packs one block of the current group's filter into the
- * BUNDLE_PACKA_INDEX slot. ndrange_id[0] is the group and ndrange_id[2] the
- * block index handed to pack_A together with get_inner_block_size().m; the
- * block is written at group_id * packA_group_size + ndrange_id[2] *
- * packed_per_oc_block_size bytes into the slot.
- */
- template <typename src_ctype>
- static void packA_kern(WorkspaceBundle bundle,
- const ConvBiasImpl::NCBKernParam& param,
- fallback::MatrixMulImpl::KernSizeParam matmulparam,
- fallback::MatrixMulImpl::AlgoBase* matmul_algo,
- ConvBiasImpl::NCBKernIndex ncb_index) {
- bundle.set(param.workspace_ptr);
- fallback::MatrixMulImpl::KernParam matmul_param;
- size_t group_id = ncb_index.ndrange_id[0];
- static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) =
- matmulparam;
- size_t packA_group_size =
- matmul_algo->get_bundle(matmul_param).get_size(0);
- size_t packed_per_oc_block_size =
- round_up(matmul_param.K,
- matmul_algo->get_inner_block_size().k) *
- matmul_algo->get_inner_block_size().m *
- matmul_algo->get_packA_type_size();
- size_t a_panel_offset =
- ncb_index.ndrange_id[2] * packed_per_oc_block_size;
- int8_t* a_panel = static_cast<int8_t*>(bundle.get(
- Im2colBundelIndex::BUNDLE_PACKA_INDEX)) +
- group_id * packA_group_size + a_panel_offset;
- matmul_param.A_ptr =
- const_cast<src_ctype*>(param.filter<src_ctype>(group_id));
- matmul_algo->pack_A(matmul_param, a_panel, ncb_index.ndrange_id[2],
- matmul_algo->get_inner_block_size().m);
- };
-
- /*! conv kernel: computes one (ohw tile, oc tile) block of the output for one
- * batch and group. ndrange_id is read as {group, batch, ohw tile index,
- * oc tile index}. The work is done by the COPY_BIAS, IM2COL,
- * PREPAR_MATMUL_DATA, MATMUL_COMPUTE and POSTPROCESS_AND_COPYDST macros,
- * which rely on the local variable names declared below.
- */
- template <typename src_ctype, typename bias_ctype, typename dst_ctype,
- typename op_ctype, typename op_dtype,
- PostprocessMode postprocess_mode>
- static void kerns(
- WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
- const ConvBiasImpl::NCBKernParam& param,
- fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param,
- fallback::MatrixMulImpl::AlgoBase* matmul_algo,
- fallback::ConvBiasImpl::NCBKernIndex ncb_index,
- size_t ohw_tile_size, size_t oc_tile_size) {
- auto is_xcorr = !param.filter_meta.should_flip;
- UNPACK_CONV_F32_NCB_KERN_SIZES(param);
- MEGDNN_MARK_USED_VAR(N);
- auto IH2 = IH + 2 * PH;
- auto IW2 = IW + 2 * PW;
- size_t OHW = OH * OW;
- size_t group_id = ncb_index.ndrange_id[0];
- size_t batch_id = ncb_index.ndrange_id[1];
- size_t output_block_size = std::min(
- ohw_tile_size, OHW - ncb_index.ndrange_id[2] * ohw_tile_size); //!< the last ohw tile may be shorter
- size_t output_block_oc_size = std::min(
- oc_tile_size, OC - ncb_index.ndrange_id[3] * oc_tile_size); //!< the last oc tile may be shorter
-
- //! misc flags
- bool special_1x1 = (FH == 1 && FW == 1 && SH == 1 && SW == 1 &&
- PH == 0 && PW == 0);
- bool is_dst_8bit =
- (param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
- param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
- (param.src_type.enumv() == DTypeEnum::Quantized8Asymm &&
- param.dst_type.enumv() == DTypeEnum::Quantized8Asymm);
- bool is_ohw_size_bigger = (ohw_tile_size >= OHW); //!< a single ohw tile covers the whole output plane
- bool skip_copy_dst = is_ohw_size_bigger && !is_dst_8bit; //!< in this case get_matmul_dst_ptr already returns the dst tensor
-
- //! misc index
- size_t ohw_cur_index = ncb_index.ndrange_id[2] * ohw_tile_size;
- size_t oc_cur_index = ncb_index.ndrange_id[3] * oc_tile_size;
- size_t oc_end_index = oc_cur_index + output_block_oc_size;
-
- bundle.set(param.workspace_ptr);
- bundle_thread.set(PtrGetter::get_bundle_offset_byte_ptr<int8_t>(
- bundle, Im2colBundelIndex::BUNDLE_THREAD_INDEX,
- bundle_thread.total_size_in_bytes() * ncb_index.thread_id)); //!< each thread_id gets its own slice of the thread slot
-
- fallback::MatrixMulImpl::KernParam matmul_param;
- static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) =
- matmul_kernsize_param;
- matmul_param.workspace_ptr = bundle_thread.get(
- Im2colBundelIndex::THREAD_BUNDLE_COMPUTE_INDEX);
-
- //! 1.Copy bias if need
- COPY_BIAS();
-
- //! 2.Im2col
- IM2COL();
-
- //! 3.packb and matmul compute
- PREPAR_MATMUL_DATA();
- MATMUL_COMPUTE();
-
- //! 4.postprocess and copy dst if need
- POSTPROCESS_AND_COPYDST();
- #undef PREPAR_MATMUL_DATA
- #undef MATMUL_COMPUTE
- }
- };
- /*! \brief PREPAR_MATMUL_DATA for Pack_Mode::ONLY_PACKA: b_panel stays
- * nullptr. a_panel points into the BUNDLE_PACKA_INDEX slot at byte offset
- * group_id * packA_group_size + a_panel_offset, where packA_group_size is the
- * slot size divided by the group count and a_panel_offset is ndrange_id[3]
- * times the first slot size of the matmul bundle. matmul_dst comes from
- * PtrGetter::get_matmul_dst_ptr with the THREAD_BUNDLE_MATMUL_DST_INDEX slot
- * as the workspace candidate.
- */
- #define PREPAR_MATMUL_DATA() \
- bias_ctype* matmul_dst = nullptr; \
- src_ctype* b_panel = nullptr; \
- size_t packA_group_size = \
- bundle.get_size(Im2colBundelIndex::BUNDLE_PACKA_INDEX) / \
- param.filter_meta.group; \
- size_t a_panel_offset = ncb_index.ndrange_id[3] * \
- matmul_algo->get_bundle(matmul_param).get_size(0); \
- \
- src_ctype* a_panel = PtrGetter::get_bundle_offset_byte_ptr<src_ctype>( \
- bundle, Im2colBundelIndex::BUNDLE_PACKA_INDEX, \
- group_id * packA_group_size + a_panel_offset); \
- matmul_dst = PtrGetter::get_matmul_dst_ptr<bias_ctype>( \
- param, bundle_thread, \
- Im2colBundelIndex::THREAD_BUNDLE_MATMUL_DST_INDEX, oc_cur_index, \
- OHW, is_dst_8bit, is_ohw_size_bigger, batch_id, group_id);
- /*! \brief MATMUL_COMPUTE for Pack_Mode::ONLY_PACKA: same parameter setup as
- * the DEFAULT variant, but there is no pack_B call; the naked kernel is run
- * with the packed a_panel and the b_panel declared by PREPAR_MATMUL_DATA
- * (nullptr), while B itself is passed through matmul_param.B_ptr.
- */
- #define MATMUL_COMPUTE() \
- auto matmul_kern_naked = matmul_algo->get_kern_naked(matmul_param); \
- matmul_param.M = output_block_oc_size; \
- matmul_param.N = output_block_size; \
- matmul_param.LDB = special_1x1 ? OH * OW : output_block_size; \
- matmul_param.LDC = output_block_size; \
- matmul_param.A_ptr = a_panel; \
- matmul_param.B_ptr = im2col_dst ? im2col_dst : no_padding_src; \
- matmul_param.C_ptr = matmul_dst; \
- matmul_kern_naked(matmul_param, a_panel, b_panel);
- /*! \brief Kernels for matmul algorithms in Pack_Mode::ONLY_PACKA: packA_kern
- * packs the filter (A) one oc tile at a time, and kerns runs the naked matmul
- * kernel without packing B.
- */
- template <>
- class Im2colKerns<Pack_Mode::ONLY_PACKA> {
- public:
- /*! packA kern: packs one oc tile of the current group's filter into the
- * BUNDLE_PACKA_INDEX slot. ndrange_id[0] is the group and ndrange_id[2] the
- * oc tile index; the tile size is taken from matmulparam.M, and M is
- * replaced by the (possibly shorter) actual tile height before pack_A runs.
- */
- template <typename src_ctype>
- static void packA_kern(WorkspaceBundle bundle,
- const ConvBiasImpl::NCBKernParam& param,
- fallback::MatrixMulImpl::KernSizeParam matmulparam,
- fallback::MatrixMulImpl::AlgoBase* matmul_algo,
- ConvBiasImpl::NCBKernIndex ncb_index) {
- bundle.set(param.workspace_ptr);
- fallback::MatrixMulImpl::KernParam matmul_param;
- static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) =
- matmulparam;
- size_t OC = param.filter_meta.ocpg;
- size_t oc_tile_size = matmul_param.M;
- size_t group_id = ncb_index.ndrange_id[0];
- size_t output_block_oc_size = std::min(
- oc_tile_size, OC - ncb_index.ndrange_id[2] * oc_tile_size);
- size_t oc_cur_index = ncb_index.ndrange_id[2] * oc_tile_size;
- size_t packA_group_size =
- bundle.get_size(Im2colBundelIndex::BUNDLE_PACKA_INDEX) /
- param.filter_meta.group;
- size_t a_panel_offset =
- ncb_index.ndrange_id[2] *
- matmul_algo->get_bundle(matmul_param).get_size(0); //!< evaluated while matmul_param.M still holds the full tile size
- int8_t* a_panel = static_cast<int8_t*>(bundle.get(
- Im2colBundelIndex::BUNDLE_PACKA_INDEX)) +
- group_id * packA_group_size + a_panel_offset;
- matmul_param.A_ptr =
- const_cast<src_ctype*>(param.filter<src_ctype>(group_id)) +
- oc_cur_index * matmul_param.K;
- matmul_param.M = output_block_oc_size;
- matmul_algo->pack_A(matmul_param, a_panel, 0_z, 0_z);
- };
-
- /*! conv kernel: computes one (ohw tile, oc tile) block of the output for one
- * batch and group. ndrange_id is read as {group, batch, ohw tile index,
- * oc tile index}. The work is done by the COPY_BIAS, IM2COL,
- * PREPAR_MATMUL_DATA, MATMUL_COMPUTE and POSTPROCESS_AND_COPYDST macros,
- * which rely on the local variable names declared below.
- */
- template <typename src_ctype, typename bias_ctype, typename dst_ctype,
- typename op_ctype, typename op_dtype,
- PostprocessMode postprocess_mode>
- static void kerns(
- WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
- const ConvBiasImpl::NCBKernParam& param,
- fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param,
- fallback::MatrixMulImpl::AlgoBase* matmul_algo,
- fallback::ConvBiasImpl::NCBKernIndex ncb_index,
- size_t ohw_tile_size, size_t oc_tile_size) {
- auto is_xcorr = !param.filter_meta.should_flip;
- UNPACK_CONV_F32_NCB_KERN_SIZES(param);
- MEGDNN_MARK_USED_VAR(N);
- auto IH2 = IH + 2 * PH;
- auto IW2 = IW + 2 * PW;
- size_t group_id = ncb_index.ndrange_id[0];
- size_t batch_id = ncb_index.ndrange_id[1];
- size_t OHW = OH * OW;
- size_t output_block_size = std::min(
- ohw_tile_size, OHW - ncb_index.ndrange_id[2] * ohw_tile_size); //!< the last ohw tile may be shorter
- size_t output_block_oc_size = std::min(
- oc_tile_size, OC - ncb_index.ndrange_id[3] * oc_tile_size); //!< the last oc tile may be shorter
-
- //! misc flags
- bool special_1x1 = (FH == 1 && FW == 1 && SH == 1 && SW == 1 &&
- PH == 0 && PW == 0);
- bool is_dst_8bit =
- (param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
- param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
- (param.src_type.enumv() == DTypeEnum::Quantized8Asymm &&
- param.dst_type.enumv() == DTypeEnum::Quantized8Asymm);
- bool is_ohw_size_bigger = (ohw_tile_size >= OHW); //!< a single ohw tile covers the whole output plane
- bool skip_copy_dst = is_ohw_size_bigger && !is_dst_8bit; //!< in this case get_matmul_dst_ptr already returns the dst tensor
-
- //! misc index
- size_t ohw_cur_index = ncb_index.ndrange_id[2] * ohw_tile_size;
- size_t oc_cur_index = ncb_index.ndrange_id[3] * oc_tile_size;
- size_t oc_end_index = oc_cur_index + output_block_oc_size;
-
- bundle.set(param.workspace_ptr);
- bundle_thread.set(PtrGetter::get_bundle_offset_byte_ptr<int8_t>(
- bundle, Im2colBundelIndex::BUNDLE_THREAD_INDEX,
- bundle_thread.total_size_in_bytes() * ncb_index.thread_id)); //!< each thread_id gets its own slice of the thread slot
-
- fallback::MatrixMulImpl::KernParam matmul_param;
- static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) =
- matmul_kernsize_param;
- matmul_param.workspace_ptr = bundle_thread.get(
- Im2colBundelIndex::THREAD_BUNDLE_COMPUTE_INDEX);
-
- //! 1.Copy bias if need
- COPY_BIAS();
-
- //! 2.Im2col
- IM2COL();
-
- //! 3.matmul compute on the pre-packed A (no pack_B call in this mode)
- PREPAR_MATMUL_DATA();
- MATMUL_COMPUTE();
-
- //! 4.postprocess and copy dst if need
- POSTPROCESS_AND_COPYDST();
- #undef PREPAR_MATMUL_DATA
- #undef MATMUL_COMPUTE
- }
- };
- /*! \brief PREPAR_MATMUL_DATA for Pack_Mode::NO_PACK: filter points at the
- * current group's filter advanced by oc_cur_index * IC * FH * FW elements,
- * and matmul_dst comes from PtrGetter::get_matmul_dst_ptr with the
- * THREAD_BUNDLE_MATMUL_DST_INDEX slot as the workspace candidate.
- */
- #define PREPAR_MATMUL_DATA() \
- bias_ctype* matmul_dst = nullptr; \
- const src_ctype* filter = \
- param.filter<src_ctype>(group_id) + oc_cur_index * IC * FH * FW; \
- matmul_dst = PtrGetter::get_matmul_dst_ptr<bias_ctype>( \
- param, bundle_thread, \
- Im2colBundelIndex::THREAD_BUNDLE_MATMUL_DST_INDEX, oc_cur_index, \
- OHW, is_dst_8bit, is_ohw_size_bigger, batch_id, group_id);
- /*! \brief MATMUL_COMPUTE for Pack_Mode::NO_PACK: fills matmul_param for the
- * current tile with A_ptr = filter and B_ptr = im2col_dst (or no_padding_src
- * when im2col_dst is null), then fetches the kernel with get_kern after the
- * parameters are set and runs it on matmul_param alone.
- */
- #define MATMUL_COMPUTE() \
- matmul_param.M = output_block_oc_size; \
- matmul_param.N = output_block_size; \
- matmul_param.LDB = special_1x1 ? OH * OW : output_block_size; \
- matmul_param.LDC = output_block_size; \
- matmul_param.A_ptr = filter; \
- matmul_param.B_ptr = im2col_dst ? im2col_dst : no_padding_src; \
- matmul_param.C_ptr = matmul_dst; \
- auto matmul_kern_t = matmul_algo->get_kern(matmul_param); \
- matmul_kern_t(matmul_param);
- /*! \brief Kernels for matmul algorithms in Pack_Mode::NO_PACK: there is no
- * packA kernel; kerns feeds the filter and the im2col data to the matmul
- * kernel as they are.
- */
- template <>
- class Im2colKerns<Pack_Mode::NO_PACK> {
- public:
- /*! conv kernel: computes one (ohw tile, oc tile) block of the output for one
- * batch and group. ndrange_id is read as {group, batch, ohw tile index,
- * oc tile index}. The work is done by the COPY_BIAS, IM2COL,
- * PREPAR_MATMUL_DATA, MATMUL_COMPUTE and POSTPROCESS_AND_COPYDST macros,
- * which rely on the local variable names declared below.
- */
- template <typename src_ctype, typename bias_ctype, typename dst_ctype,
- typename op_ctype, typename op_dtype,
- PostprocessMode postprocess_mode>
- static void kerns(
- WorkspaceBundle bundle, WorkspaceBundle bundle_thread,
- const ConvBiasImpl::NCBKernParam& param,
- fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param,
- fallback::MatrixMulImpl::AlgoBase* matmul_algo,
- fallback::ConvBiasImpl::NCBKernIndex ncb_index,
- size_t ohw_tile_size, size_t oc_tile_size) {
- auto is_xcorr = !param.filter_meta.should_flip;
- UNPACK_CONV_F32_NCB_KERN_SIZES(param);
- MEGDNN_MARK_USED_VAR(N);
- auto IH2 = IH + 2 * PH;
- auto IW2 = IW + 2 * PW;
- size_t group_id = ncb_index.ndrange_id[0];
- size_t batch_id = ncb_index.ndrange_id[1];
- size_t OHW = OH * OW;
- size_t output_block_size = std::min(
- ohw_tile_size, OHW - ncb_index.ndrange_id[2] * ohw_tile_size); //!< the last ohw tile may be shorter
- size_t output_block_oc_size = std::min(
- oc_tile_size, OC - ncb_index.ndrange_id[3] * oc_tile_size); //!< the last oc tile may be shorter
- //! misc flags
- bool special_1x1 = (FH == 1 && FW == 1 && SH == 1 && SW == 1 &&
- PH == 0 && PW == 0);
- bool is_dst_8bit =
- (param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
- param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
- (param.src_type.enumv() == DTypeEnum::Quantized8Asymm &&
- param.dst_type.enumv() == DTypeEnum::Quantized8Asymm);
- bool is_ohw_size_bigger = (ohw_tile_size >= OHW); //!< a single ohw tile covers the whole output plane
- bool skip_copy_dst = is_ohw_size_bigger && !is_dst_8bit; //!< in this case get_matmul_dst_ptr already returns the dst tensor
-
- //! misc index
- size_t ohw_cur_index = ncb_index.ndrange_id[2] * ohw_tile_size;
- size_t oc_cur_index = ncb_index.ndrange_id[3] * oc_tile_size;
- size_t oc_end_index = oc_cur_index + output_block_oc_size;
-
- bundle.set(param.workspace_ptr);
- bundle_thread.set(PtrGetter::get_bundle_offset_byte_ptr<int8_t>(
- bundle, Im2colBundelIndex::BUNDLE_THREAD_INDEX,
- bundle_thread.total_size_in_bytes() * ncb_index.thread_id)); //!< each thread_id gets its own slice of the thread slot
-
- fallback::MatrixMulImpl::KernParam matmul_param;
- static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) =
- matmul_kernsize_param;
- matmul_param.workspace_ptr = bundle_thread.get(
- Im2colBundelIndex::THREAD_BUNDLE_COMPUTE_INDEX);
-
- //! 1.Copy bias if need
- COPY_BIAS();
-
- //! 2.Im2col
- IM2COL();
-
- //! 3.matmul compute directly on the filter and the im2col data (no packing)
- PREPAR_MATMUL_DATA();
- MATMUL_COMPUTE();
-
- //! 4.postprocess and copy dst if need
- POSTPROCESS_AND_COPYDST();
-
- #undef PREPAR_MATMUL_DATA
- #undef MATMUL_COMPUTE
- }
- };
-
- #undef COPY_BIAS
- #undef IM2COL
- #undef POSTPROCESS_AND_COPYDST
- /*!
-  * \brief Describe the matmul problem solved for one im2col tile.
-  *
-  * M is \p oc_tile_size, N is \p ohw_tile_size and K is icpg * FH * FW; the
-  * leading dimensions are LDA = K and LDB = LDC = N. The third dtype of the
-  * result is bias_type when source and destination are both QuantizedS8 or
-  * both Quantized8Asymm, and dst_type otherwise.
-  */
- fallback::MatrixMulImpl::KernSizeParam
- ConvBiasImpl::AlgoIm2col ::get_matmul_kern_param(const NCBKernSizeParam& param,
-                                                  size_t ohw_tile_size,
-                                                  size_t oc_tile_size) const {
-     const size_t K = param.filter_meta.icpg * param.filter_meta.spatial[0] *
-                      param.filter_meta.spatial[1];
-
-     const auto src_enum = param.src_type.enumv();
-     const auto dst_enum = param.dst_type.enumv();
-     const bool is_dst_8bit = (src_enum == dst_enum) &&
-                              (src_enum == DTypeEnum::QuantizedS8 ||
-                               src_enum == DTypeEnum::Quantized8Asymm);
-
-     return {param.filter_type,
-             param.src_type,
-             is_dst_8bit ? param.bias_type : param.dst_type,
-             /* M */ oc_tile_size,
-             /* N */ ohw_tile_size,
-             /* K */ K,
-             /* LDA */ K,
-             /* LDB */ ohw_tile_size,
-             /* LDC */ ohw_tile_size,
-             false,
-             false,
-             param::MatrixMul::ComputeMode::DEFAULT,
-             param::MatrixMul::Format::DEFAULT};
- }
-
- /*!
-  * \brief Choose m_ohw_tile_size and m_oc_tile_size for \p param.
-  *
-  * \param param kernel size description (thread count, ocpg, output size)
-  * \param block_m granularity the oc tile is rounded up to
-  * \param block_n granularity the ohw tile is rounded up to
-  * \param need_pack when false and only one thread is used, one tile covers
-  *        the whole output plane and all output channels
-  */
- void ConvBiasImpl::AlgoIm2col::choice_ohw_oc_block(
-         const NCBKernSizeParam& param, size_t block_m, size_t block_n,
-         bool need_pack) const {
-     const size_t thread_count = param.nr_threads;
-     const size_t oc_per_group = param.filter_meta.ocpg;
-     const size_t out_plane = param.osz[0] * param.osz[1];
-
-     //! Do not remove the two resets below: the same algo object serves
-     //! several operators, and an earlier call may have changed both tile
-     //! sizes. Starting from the original values keeps the result, and with
-     //! it the workspace size, dependent on param only.
-     m_oc_tile_size = DEFAULT_OC_TILE_SIZE;
-     m_ohw_tile_size = m_ohw_tile_origin;
-
-     m_oc_tile_size = std::min(m_oc_tile_size, oc_per_group);
-     m_ohw_tile_size = std::min(m_ohw_tile_size, out_plane);
-
-     if (thread_count <= 1) {
-         if (!need_pack) {  //! no pack, usually on x86, saves memory
-             m_ohw_tile_size = out_plane;
-             m_oc_tile_size = oc_per_group;
-         }
-         return;
-     }
-
-     //! enough ohw tiles to keep every thread busy: nothing to adjust
-     if (out_plane / m_ohw_tile_size >= thread_count) {
-         return;
-     }
-
-     m_ohw_tile_size = round_up(div_ceil(out_plane, thread_count), block_n);
-     if (m_ohw_tile_size >= DEFAULT_OHW_MIN_TILE_SIZE) {
-         return;
-     }
-
-     //! ohw tiles would become too small: keep the plane whole and split
-     //! the output channels across threads instead
-     m_ohw_tile_size = out_plane;
-     m_oc_tile_size = round_up(div_ceil(oc_per_group, thread_count), block_m);
-     if (m_oc_tile_size > DEFAULT_OC_MAX_TILE_SIZE) {
-         m_oc_tile_size = DEFAULT_OC_MAX_TILE_SIZE;
-     } else if (m_oc_tile_size < DEFAULT_OC_MIN_TILE_SIZE) {
-         m_oc_tile_size = DEFAULT_OC_MIN_TILE_SIZE;
-     }
- }
-
- /*!
-  * \brief Build the operator-level workspace bundle {padding, packA, threads}.
-  *
-  * The slots match Im2colBundelIndex::BUNDLE_*_INDEX. As a side effect the
-  * tile sizes m_ohw_tile_size / m_oc_tile_size are refreshed through
-  * choice_ohw_oc_block() before any size is derived from them.
-  *
-  * \param param kernel size description of the convolution
-  * \return bundle with a null base pointer: padded source copy (0 bytes when
-  *         PH == 0 && PW == 0), packed filter for all groups, and one
-  *         get_thread_bundle() worth of bytes per thread
-  */
- WorkspaceBundle ConvBiasImpl::AlgoIm2col::get_bundle(
-         const NCBKernSizeParam& param) const {
-     UNPACK_CONV_F32_NCB_KERN_SIZES(param);
-     MEGDNN_MARK_USED_VAR(OC);
-     MEGDNN_MARK_USED_VAR(OH);
-     MEGDNN_MARK_USED_VAR(OW);
-     MEGDNN_MARK_USED_VAR(FH);
-     MEGDNN_MARK_USED_VAR(FW);
-     MEGDNN_MARK_USED_VAR(SW);
-     MEGDNN_MARK_USED_VAR(SH);
-
-     //! padded input extent: the height pads with PH, the width with PW
-     auto IH2 = IH + 2 * PH;
-     auto IW2 = IW + 2 * PW;
-     bool no_need_pading = (PH == 0 && PW == 0);
-     size_t padding = 0, packa_size = 0, packa_group_size = 0;
-     size_t nr_threads = param.nr_threads;
-     size_t GROUP = param.filter_meta.group;
-     bool need_pack = m_matmul_algo->packmode() == Pack_Mode::DEFAULT;
-     bool only_packA = m_matmul_algo->packmode() == Pack_Mode::ONLY_PACKA;
-     if (need_pack || only_packA) {
-         auto inner_block = m_matmul_algo->get_inner_block_size();
-         choice_ohw_oc_block(param, inner_block.m, inner_block.n, need_pack);
-         auto im2col_kern_param = get_matmul_kern_param(
-                 param, m_ohw_tile_size, only_packA ? m_oc_tile_size : OC);
-         size_t oc_parallel_times = div_ceil<size_t>(OC, m_oc_tile_size);
-         WorkspaceBundle wb = m_matmul_algo->get_bundle(im2col_kern_param);
-         //! ONLY_PACKA packs every oc tile separately, DEFAULT packs the
-         //! filter of a group as a whole
-         packa_group_size = only_packA ? oc_parallel_times * wb.get_size(0)
-                                       : wb.get_size(0);
-     } else {  //! matmul algo does not pack: no packA workspace needed
-         size_t nopack_default_blockm = 8;
-         size_t nopack_default_blockn = 16;
-         choice_ohw_oc_block(param, nopack_default_blockm, nopack_default_blockn,
-                             need_pack);
-         packa_group_size = 0;
-     }
-     if (no_need_pading) {
-         padding = 0;  //! no padding needed
-     } else {
-         //! one padded plane per (group, batch, channel). The element size
-         //! must come from the dtype itself: sizeof(param.src_type) would be
-         //! the size of the DType object, not of one element.
-         padding = (GROUP * N * IC * IH2 * IW2) * param.src_type.size();
-     }
-     packa_size = GROUP * packa_group_size;  //! packA size = GROUP * a_size
-     WorkspaceBundle ws = get_thread_bundle(param);
-     return {nullptr,
-             {padding, packa_size, ws.total_size_in_bytes() * nr_threads}};
- }
-
- /*!
-  * \brief Build the per-thread workspace bundle
-  * {packB, im2col, matmul dst, bias copy, matmul compute}.
-  *
-  * The slots match Im2colBundelIndex::THREAD_BUNDLE_*_INDEX. The sizes depend
-  * on the current m_ohw_tile_size / m_oc_tile_size, so choice_ohw_oc_block()
-  * must have run for \p param beforehand (get_bundle() does that).
-  *
-  * \param param kernel size description of the convolution
-  * \return bundle with a null base pointer; unused slots have size 0
-  */
- WorkspaceBundle ConvBiasImpl::AlgoIm2col::get_thread_bundle(
-         const NCBKernSizeParam& param) const {
-     size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0],
-            FW = param.filter_meta.spatial[1];
-     size_t ohw = param.osz[0] * param.osz[1];
-
-     size_t im2col = 0, packb = 0, matmul_dst = 0, bias_temp = 0,
-            matmul_compute = 0;
-     auto im2col_kern_param =
-             get_matmul_kern_param(param, m_ohw_tile_size, m_oc_tile_size);
-     bool default_pack = m_matmul_algo->packmode() == Pack_Mode::DEFAULT;
-     bool only_packA = m_matmul_algo->packmode() == Pack_Mode::ONLY_PACKA;
-     bool is_dst_8bit = (param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
-                         param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
-                        (param.src_type.enumv() == DTypeEnum::Quantized8Asymm &&
-                         param.dst_type.enumv() == DTypeEnum::Quantized8Asymm);
-     //! Element sizes come from the dtypes themselves; sizeof() on a DType
-     //! member would yield the size of the DType object instead.
-     //! NOTE(review): assumes param.bias_type is a valid dtype even when the
-     //! convolution has no bias - confirm against the callers.
-     size_t src_elem_size = param.src_type.size();
-     size_t bias_elem_size = param.bias_type.size();
-     size_t im2col_dst_size = IC * FH * FW * m_ohw_tile_size * src_elem_size;
-     size_t matmul_dst_size =
-             m_oc_tile_size * m_ohw_tile_size * bias_elem_size;
-     if (default_pack || only_packA) {
-         WorkspaceBundle wb = m_matmul_algo->get_bundle(im2col_kern_param);
-         packb = wb.get_size(1);
-         //! in DEFAULT mode matmul dst and im2col dst share one slot, so it
-         //! must fit the larger of the two; ONLY_PACKA keeps them apart
-         im2col = only_packA ? im2col_dst_size
-                             : std::max(im2col_dst_size, matmul_dst_size);
-         matmul_dst = only_packA ? matmul_dst_size : 0;
-     } else {
-         im2col = im2col_dst_size;
-         if (is_dst_8bit) {
-             matmul_dst = matmul_dst_size;
-         } else {
-             //! a single tile covering the whole plane is written straight
-             //! into the dst tensor, so no staging buffer is needed
-             matmul_dst = m_ohw_tile_size >= ohw ? 0 : matmul_dst_size;
-         }
-         matmul_compute = m_matmul_algo->get_workspace(im2col_kern_param);
-     }
-     if (param.bias_mode == megdnn::BiasMode::BIAS) {
-         bias_temp = m_oc_tile_size * m_ohw_tile_size * bias_elem_size;
-     }
-     return {nullptr, {packb, im2col, matmul_dst, bias_temp, matmul_compute}};
- }
-
- /*!
-  * \brief Total workspace in bytes required for \p p, i.e. the size of the
-  * whole bundle built by get_bundle().
-  */
- size_t ConvBiasImpl::AlgoIm2col::get_workspace(
-         ConvBiasImpl*, const NCBKernSizeParam& p) const {
-     MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 0) {
-         WorkspaceBundle whole_bundle = get_bundle(p);
-         return whole_bundle.total_size_in_bytes();
-     }
-     MIDOUT_END();
-     return 0;
- }
-
- SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns(
- ConvBiasImpl*, const NCBKernSizeParam& param) const {
- MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 1) {
- size_t ohw = param.osz[0] * param.osz[1];
- size_t ohw_parallel_times = div_ceil(ohw, m_ohw_tile_size);
- size_t GROUP = param.filter_meta.group;
- size_t IC = param.filter_meta.icpg;
- size_t OC = param.filter_meta.ocpg;
- size_t PH = param.filter_meta.padding[0];
- size_t PW = param.filter_meta.padding[1];
-
- WorkspaceBundle bundle = get_bundle(param);
- WorkspaceBundle bundle_thread = get_thread_bundle(param);
-
- size_t oc_parallel_times = div_ceil(OC, m_oc_tile_size);
- bool need_padding = (PH != 0 || PW != 0);
- bool default_pack = m_matmul_algo->packmode() == Pack_Mode::DEFAULT;
- bool no_pack = m_matmul_algo->packmode() == Pack_Mode::NO_PACK;
- bool only_packA = m_matmul_algo->packmode() == Pack_Mode::ONLY_PACKA;
- size_t packa_parallel_times = 0;
- if (only_packA) {
- packa_parallel_times = div_ceil(OC, m_oc_tile_size);
- } else if (default_pack) {
- packa_parallel_times =
- div_ceil(OC, m_matmul_algo->get_inner_block_size().m);
- }
-
- auto matmul_param = get_matmul_kern_param(
- param, m_ohw_tile_size, only_packA ? m_oc_tile_size : OC);
-
- SmallVector<ConvBiasImpl::NCBKern> ret_kern;
-
- #define RETURN_KERNS() \
- if (default_pack) { \
- ret_kern.push_back( \
- {kern_default_packA, {GROUP, 1_z, packa_parallel_times}}); \
- } \
- if (only_packA) { \
- ret_kern.push_back( \
- {kern_only_packA, {GROUP, 1_z, packa_parallel_times}}); \
- } \
- if (need_padding) { \
- ret_kern.push_back({kern_padding, {GROUP, param.n, IC}}); \
- } \
- if (default_pack) { \
- ret_kern.push_back( \
- {kern_compute_default, \
- {GROUP, param.n, ohw_parallel_times, oc_parallel_times}}); \
- } \
- if (no_pack) { \
- ret_kern.push_back( \
- {kern_compute_nopack, \
- {GROUP, param.n, ohw_parallel_times, oc_parallel_times}}); \
- } \
- if (only_packA) { \
- ret_kern.push_back( \
- {kern_compute_onlypackA, \
- {GROUP, param.n, ohw_parallel_times, oc_parallel_times}}); \
- } \
- return ret_kern;
-
- #define COMPUTE_KERN(_name, _pack_mode, _dt, _post_ctype, _postprocess_mode) \
- auto kern_compute_##_name = [bundle, bundle_thread, matmul_param, \
- matmul_algo = m_matmul_algo, \
- ohw_tile_size = m_ohw_tile_size, \
- oc_tile_size = m_oc_tile_size]( \
- const NCBKernParam& param, \
- const NCBKernIndex& ncb_index) { \
- Im2colKerns<_pack_mode>::kerns<_dt, _dt, _dt, _post_ctype, \
- _post_ctype, _postprocess_mode>( \
- bundle, bundle_thread, param, matmul_param, matmul_algo, \
- ncb_index, ohw_tile_size, oc_tile_size); \
- };
-
- #define cb(_dt, _post_ctype, _postprocess_mode, _midout_tags) \
- do { \
- if (param.filter_type.enumv() == DTypeTrait<_dt>::enumv) { \
- MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 1, _midout_tags) { \
- auto kern_padding = [bundle](const NCBKernParam& param, \
- const NCBKernIndex& ncb_index) { \
- copy_padding_kern<_dt>(bundle, param, ncb_index); \
- }; \
- auto kern_default_packA = \
- [bundle, matmul_algo = m_matmul_algo, matmul_param]( \
- const NCBKernParam& param, \
- const NCBKernIndex& ncb_index) { \
- Im2colKerns<Pack_Mode::DEFAULT>::packA_kern<_dt>( \
- bundle, param, matmul_param, matmul_algo, \
- ncb_index); \
- }; \
- auto kern_only_packA = [bundle, matmul_algo = m_matmul_algo, \
- matmul_param]( \
- const NCBKernParam& param, \
- const NCBKernIndex& \
- ncb_index) { \
- Im2colKerns<Pack_Mode::ONLY_PACKA>::packA_kern<_dt>( \
- bundle, param, matmul_param, matmul_algo, \
- ncb_index); \
- }; \
- COMPUTE_KERN(default, Pack_Mode::DEFAULT, _dt, _post_ctype, \
- _postprocess_mode); \
- COMPUTE_KERN(nopack, Pack_Mode::NO_PACK, _dt, _post_ctype, \
- _postprocess_mode); \
- COMPUTE_KERN(onlypackA, Pack_Mode::ONLY_PACKA, _dt, \
- _post_ctype, _postprocess_mode); \
- RETURN_KERNS(); \
- } \
- MIDOUT_END(); \
- return {}; \
- } \
- } while (0);
-
- cb(dt_float32, dt_float32, PostprocessMode::FLOAT, 0);
- #if !MEGDNN_DISABLE_FLOAT16
- cb(dt_float16, dt_float16, PostprocessMode::NO_PROCESS, 2);
- #endif
- #undef cb
- #undef COMPUTE_KERN
-
- //! Define a compute-kernel lambda named kern_compute_<_name> for the case
- //! where src, bias and dst may use different ctypes.
- //!
- //! \param _name             suffix pasted onto `kern_compute_`
- //! \param _pack_mode        Pack_Mode selecting the Im2colKerns specialization
- //! \param _src_ctype        first template argument of kerns<>
- //! \param _bias_ctype       second template argument of kerns<>
- //! \param _dst_ctype        third template argument of kerns<>
- //! \param _i_bias_type      dtype whose DTypeTrait<>::ctype is the 4th argument
- //! \param _i_dst_type       dtype whose DTypeTrait<>::ctype is the 5th argument
- //! \param _postprocess_mode PostprocessMode passed as the last argument
- //!
- //! The lambda captures bundle, bundle_thread and matmul_param by copy and
- //! takes copies of the members m_matmul_algo, m_ohw_tile_size and
- //! m_oc_tile_size through init-captures, so the lambda body does not touch
- //! `this`. It only forwards everything to Im2colKerns<_pack_mode>::kerns.
- #define COMPUTE_KERN(_name, _pack_mode, _src_ctype, _bias_ctype, _dst_ctype,   \
- _i_bias_type, _i_dst_type, _postprocess_mode)             \
- auto kern_compute_##_name = [bundle, bundle_thread, matmul_param,          \
- matmul_algo = m_matmul_algo,                  \
- ohw_tile_size = m_ohw_tile_size,              \
- oc_tile_size = m_oc_tile_size](               \
- const NCBKernParam& param,        \
- const NCBKernIndex& ncb_index) {  \
- Im2colKerns<_pack_mode>::kerns<_src_ctype, _bias_ctype, _dst_ctype,    \
- DTypeTrait<_i_bias_type>::ctype,        \
- DTypeTrait<_i_dst_type>::ctype,         \
- _postprocess_mode>(                     \
- bundle, bundle_thread, param, matmul_param, matmul_algo,       \
- ncb_index, ohw_tile_size, oc_tile_size);                       \
- };
-
- //! Dispatch helper for the case where src, bias and dst dtypes may differ
- //! (used below for the int8 and quantized combinations).
- //!
- //! \param _i_src_type       dtype matched against param.src_type
- //! \param _i_bias_type      dtype forwarded to COMPUTE_KERN only; it is not
- //!                          part of the matching condition
- //! \param _i_dst_type       dtype matched against param.dst_type
- //! \param _src_ctype        ctype used by the padding, packA and compute kernels
- //! \param _bias_ctype       bias ctype forwarded to COMPUTE_KERN
- //! \param _dst_ctype        dst ctype forwarded to COMPUTE_KERN
- //! \param _postprocess_mode PostprocessMode forwarded to COMPUTE_KERN
- //! \param _midout_tags      tag passed to MIDOUT_BEGIN for this instantiation
- //!
- //! The branch is taken when filter and src have the same dtype enum, src
- //! matches _i_src_type and dst matches _i_dst_type. Inside it the padding
- //! kernel, the two packA kernels and the three compute kernels (default /
- //! nopack / onlypackA) are defined as lambdas and RETURN_KERNS() is expanded.
- //! RETURN_KERNS is defined outside this macro; presumably it selects among
- //! the lambdas defined here and returns them -- confirm at its definition.
- //! The `return {};` after MIDOUT_END() returns an empty result if control
- //! gets past the MIDOUT block.
- //! NOTE: the macro body already ends with `;` after `while (0)`, so a call
- //! written as `cb(...);` expands to one extra empty statement.
- #define cb(_i_src_type, _i_bias_type, _i_dst_type, _src_ctype, _bias_ctype,    \
- _dst_ctype, _postprocess_mode, _midout_tags)                        \
- do {                                                                       \
- if (param.filter_type.enumv() == param.src_type.enumv() &&             \
- param.src_type.enumv() == DTypeTrait<_i_src_type>::enumv &&        \
- param.dst_type.enumv() == DTypeTrait<_i_dst_type>::enumv) {        \
- MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 1, _midout_tags) {         \
- auto kern_padding = [bundle](const NCBKernParam& param,        \
- const NCBKernIndex& ncb_index) {  \
- copy_padding_kern<_src_ctype>(bundle, param, ncb_index);   \
- };                                                             \
- auto kern_default_packA = [bundle,                             \
- matmul_algo = m_matmul_algo,        \
- matmul_param](                      \
- const NCBKernParam& param,  \
- const NCBKernIndex&         \
- ncb_index) {        \
- Im2colKerns<Pack_Mode::DEFAULT>::packA_kern<_src_ctype>(   \
- bundle, param, matmul_param, matmul_algo,          \
- ncb_index);                                        \
- };                                                             \
- auto kern_only_packA =                                         \
- [bundle, matmul_algo = m_matmul_algo, matmul_param](   \
- const NCBKernParam& param,                     \
- const NCBKernIndex& ncb_index) {               \
- Im2colKerns<Pack_Mode::ONLY_PACKA>::packA_kern<    \
- _src_ctype>(bundle, param, matmul_param,   \
- matmul_algo, ncb_index);       \
- };                                                     \
- COMPUTE_KERN(default, Pack_Mode::DEFAULT, _src_ctype,          \
- _bias_ctype, _dst_ctype, _i_bias_type,            \
- _i_dst_type, _postprocess_mode);                  \
- COMPUTE_KERN(nopack, Pack_Mode::NO_PACK, _src_ctype,           \
- _bias_ctype, _dst_ctype, _i_bias_type,            \
- _i_dst_type, _postprocess_mode);                  \
- COMPUTE_KERN(onlypackA, Pack_Mode::ONLY_PACKA, _src_ctype,     \
- _bias_ctype, _dst_ctype, _i_bias_type,            \
- _i_dst_type, _postprocess_mode);                  \
- RETURN_KERNS();                                                \
- }                                                                  \
- MIDOUT_END();                                                      \
- return {};                                                         \
- }                                                                      \
- } while (0);
-
- cb(dt_int8, dt_int32, dt_int32, dt_int8, dt_int32, dt_int32,
- PostprocessMode::NO_PROCESS, 3);
-
- cb(dt_int8, dt_int16, dt_int16, dt_int8, dt_int16, dt_int16,
- PostprocessMode::NO_PROCESS, 4);
-
- cb(dtype::QuantizedS8, dtype::QuantizedS32, dtype::QuantizedS32,
- dt_int8, dt_int32, dt_int32, PostprocessMode::NO_PROCESS, 7);
-
- cb(dtype::QuantizedS8, dtype::QuantizedS32, dtype::QuantizedS8, dt_int8,
- dt_int32, dt_int8, PostprocessMode::QUANTIZED, 8);
- #undef COMPUTE_KERN
- #undef RETURN_KERNS
- #undef cb
- megdnn_throw("unsupported data type on im2col matmul algo");
- }
- MIDOUT_END();
- return {};
- }
-
- //! Report whether the im2col algorithm can run the convolution described by
- //! \p param on operator \p opr.
- //!
- //! \param opr   operator whose param().format is inspected
- //! \param param kernel size parameters (dtypes, bias mode, nonlinearity,
- //!              filter meta, compute mode)
- //! \return true only if the wrapped matmul algorithm accepts the derived
- //!         matmul parameters and all of the following hold: NCHW format,
- //!         square filter with spatial size <= 7, dilation 1 on both axes and
- //!         ComputeMode::DEFAULT. Returns false for 8x8x16 / 8x8x32 requests
- //!         that carry a bias or a non-identity nonlinearity.
- bool ConvBiasImpl::AlgoIm2col::usable(
- ConvBiasImpl* opr, const NCBKernSizeParam& param,
- AlgoSelectionStrategy /*algo_selection_strategy*/) const {
- MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 2) {
- //! 8x8x16 and 8x8x32 do not support PostProcess: they are only usable
- //! when the bias mode is NO_BIAS and the nonlinearity is IDENTITY. A
- //! request that violates either requirement must be rejected, hence the
- //! `||` between the two checks below.
- const auto src_enum = param.src_type.enumv();
- const auto dst_enum = param.dst_type.enumv();
- const bool int8_to_int16_or_int32 =
- src_enum == DTypeEnum::Int8 &&
- (dst_enum == DTypeEnum::Int16 || dst_enum == DTypeEnum::Int32);
- const bool quantized8_to_qint32 =
- (src_enum == DTypeEnum::QuantizedS8 ||
- src_enum == DTypeEnum::Quantized8Asymm) &&
- dst_enum == DTypeEnum::QuantizedS32;
- const bool no_postprocess_dtype =
- src_enum == param.filter_type.enumv() &&
- (int8_to_int16_or_int32 || quantized8_to_qint32);
- if (no_postprocess_dtype &&
- (param.bias_mode != megdnn::BiasMode::NO_BIAS ||
- param.nonlineMode != megdnn::NonlineMode::IDENTITY)) {
- return false;
- }
- //! The underlying matmul must accept the problem derived from the
- //! configured ohw / oc tile sizes.
- fallback::MatrixMulImpl::KernSizeParam matmul_param =
- get_matmul_kern_param(param, m_ohw_tile_size, m_oc_tile_size);
- bool matmulusable = m_matmul_algo->usable(matmul_param);
- return matmulusable &&
- (opr->param().format == param::ConvBias::Format::NCHW) &&
- (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
- (param.filter_meta.spatial[0] <= 7)) &&
- (param.filter_meta.dilation[0] ==
- param.filter_meta.dilation[1] &&
- param.filter_meta.dilation[0] == 1) &&
- param.compute_mode == param::ConvBias::ComputeMode::DEFAULT;
- }
- MIDOUT_END();
- return false;
- }
-
- // vim: syntax=cpp.doxygen
|