
fix(mgb/fallback): delete im2col duplicate code and fix nchw44 usable

GitOrigin-RevId: 1aa250e9e7
tags/v1.0.0-rc1
Megvii Engine Team Xinran Xu 4 years ago
parent commit df356635b7
7 changed files with 459 additions and 495 deletions
  1. +20 -18 dnn/src/arm_common/conv_bias/postprocess_helper.h
  2. +69 -458 dnn/src/fallback/conv_bias/im2col/algos.cpp
  3. +4 -12 dnn/src/fallback/conv_bias/im2col/factory.h
  4. +364 -0 dnn/src/fallback/conv_bias/im2col/im2col_kerns.h
  5. +1 -2 dnn/src/fallback/conv_bias/im2col/strategy_default.cpp
  6. +1 -2 dnn/src/fallback/conv_bias/im2col/strategy_default_nchw44.cpp
  7. +0 -3 dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp

+20 -18 dnn/src/arm_common/conv_bias/postprocess_helper.h

@@ -100,6 +100,7 @@ namespace {
MIDOUT_END(); \
break; \
default: \
megdnn_throw("unknow biasmode"); \
break; \
}

@@ -282,24 +283,25 @@ struct PostProcess<opctype, opdtype, megdnn::PostprocessMode::QUANTIZED> {
reinterpret_cast<ctype*>(dst_ptr), bias_type, bias_type, \
dst_type, N* OC* OH* OW* pack_oc_size);

#define FOR_BIAS(_bias_mode, OH, OW) \
switch (_bias_mode) { \
case megdnn::BiasMode::NO_BIAS: \
break; \
case megdnn::BiasMode::BROADCAST_CHANNEL_BIAS: \
if (pack_oc_size == 1) { \
FOR_BINARY_BROADCAST(CONCAT_OP(AddOp)); \
} else { \
megdnn_assert(pack_oc_size == 4, \
"Only support nchw44 in ARM"); \
FOR_BINARY_BROADCAST_NCHW44(CONCAT_OP(AddOp)); \
} \
break; \
case megdnn::BiasMode::BIAS: \
FOR_BINARY(CONCAT_OP(AddOp)); \
break; \
default: \
break; \
#define FOR_BIAS(_bias_mode, OH, OW) \
switch (_bias_mode) { \
case megdnn::BiasMode::NO_BIAS: \
break; \
case megdnn::BiasMode::BROADCAST_CHANNEL_BIAS: \
if (pack_oc_size == 1) { \
FOR_BINARY_BROADCAST(CONCAT_OP(AddOp)); \
} else { \
megdnn_assert(pack_oc_size == 4, \
"Only support nchw44 in ARM"); \
FOR_BINARY_BROADCAST_NCHW44(CONCAT_OP(AddOp)); \
} \
break; \
case megdnn::BiasMode::BIAS: \
FOR_BINARY(CONCAT_OP(AddOp)); \
break; \
default: \
megdnn_throw("unknow biasmode"); \
break; \
}

template <typename ctype, typename dtype>
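The FOR_BIAS change is easier to read outside macro form. Below is a minimal sketch of the control flow the macro expands to; add_broadcast_channel, add_broadcast_nchw44, and add_elementwise are hypothetical stand-ins for the FOR_BINARY_* expansions:

void apply_bias(megdnn::BiasMode bias_mode, size_t pack_oc_size) {
    switch (bias_mode) {
        case megdnn::BiasMode::NO_BIAS:
            break;  // nothing to add
        case megdnn::BiasMode::BROADCAST_CHANNEL_BIAS:
            if (pack_oc_size == 1) {
                add_broadcast_channel();  // plain NCHW layout
            } else {
                megdnn_assert(pack_oc_size == 4, "Only support nchw44 in ARM");
                add_broadcast_nchw44();  // four channels packed per block
            }
            break;
        case megdnn::BiasMode::BIAS:
            add_elementwise();  // full bias tensor with the same shape as dst
            break;
        default:
            megdnn_throw("unknown biasmode");  // the case this commit adds
    }
}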


+69 -458 dnn/src/fallback/conv_bias/im2col/algos.cpp

@@ -10,6 +10,7 @@
*/

#include "src/fallback/conv_bias/im2col/algos.h"
#include "src/fallback/conv_bias/im2col/im2col_kerns.h"
#include "src/fallback/conv_bias/im2col/factory.h"
#include "megdnn/opr_param_defs.h"
#include "src/common/opr_delegate.h"
@@ -25,278 +26,6 @@ using namespace megdnn;
using namespace fallback;
using namespace im2col;

/*======================== AlgoIm2col=======================*/
/*!
 * \brief The index of each part of the im2col workspace bundle,
 * through which the needed pointers can be conveniently fetched
 */
struct Im2colBundelIndex {
static constexpr size_t BUNDLE_THREAD_INDEX = 2_z;
};

using Pack_Mode = fallback::MatrixMulImpl::AlgoBase::PackMode;
/*!
 * \brief Im2colKerns collects all the im2col kerns in it
 */

template <Pack_Mode packmode>
class Im2colKerns;

template <>
class Im2colKerns<Pack_Mode::DEFAULT> {
public:
//! conv kernel
static void kerns(
const WorkspaceBundle& bundle, WorkspaceBundle bundle_thread,
const ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription&
matmul_desc,
StrategyParam strategyparam,
fallback::ConvBiasImpl::NCBKernIndex ncb_index,
size_t ohw_tile_size, StrategyBase* im2colstrategy) {
size_t OC = param.filter_meta.ocpg;
size_t output_block_size = std::min(
ohw_tile_size,
strategyparam.ohw - ncb_index.ndrange_id[2] * ohw_tile_size);
size_t output_block_oc_size = std::min(
strategyparam.oc_tile_size,
OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size);

strategyparam.batch_id = ncb_index.ndrange_id[0];
strategyparam.group_id = ncb_index.ndrange_id[1];
strategyparam.oc_cur_index =
ncb_index.ndrange_id[3] *
strategyparam.oc_tile_size;
strategyparam.oc_end_index = strategyparam.oc_cur_index +
output_block_oc_size;
strategyparam.ohw_cur_index =
ncb_index.ndrange_id[2] * ohw_tile_size;
strategyparam.output_block_oc_size = output_block_oc_size;
strategyparam.output_block_size = output_block_size;

bundle_thread.set(
static_cast<int8_t*>(
bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) +
bundle_thread.total_size_in_bytes() * ncb_index.thread_id);
fallback::MatrixMulImpl::KernParam matmul_param;
static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) =
matmul_kernsize_param;

//! 1.Im2col
im2colstrategy->exec_im2col(bundle, bundle_thread, strategyparam, param,
matmul_param, matmul_algo);

//! 2.packb and matmul compute
im2colstrategy->exec_matmul(param, strategyparam, bundle, bundle_thread,
matmul_param, matmul_algo, ncb_index,
matmul_desc);

//! 3.postprocess and copy dst if needed
im2colstrategy->exec_postprocess(param, strategyparam, bundle_thread);
}

WorkspaceBundle get_thread_bundle(
const fallback::ConvBiasImpl::NCBKernSizeParam& param,
const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param,
const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size,
size_t oc_tile_size) {
size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0],
FW = param.filter_meta.spatial[1];
size_t pack_oc_size = pack_size(param.filter_meta.format);
size_t im2col = 0, packb = 0, bias_temp = 0;
bool default_pack = matmul_algo->packmode() == Pack_Mode::DEFAULT;
megdnn_assert(default_pack, "only support default packa");
size_t im2col_dst_size =
IC * FH * FW * ohw_tile_size * sizeof(param.src_type);
size_t matmul_dst_size = pack_oc_size * oc_tile_size * ohw_tile_size *
sizeof(param.bias_type);
//! matmul_dst and im2col_dst use the same memory
WorkspaceBundle wb = matmul_algo->get_bundle(im2col_kern_param);
packb = wb.get_size(1);
im2col = std::max(im2col_dst_size, matmul_dst_size);
if (param.bias_mode == megdnn::BiasMode::BIAS) {
bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type);
}
return {nullptr, {packb, im2col, bias_temp}};
}
};

template <>
class Im2colKerns<Pack_Mode::ONLY_PACKA> {
public:
//! conv kernel
static void kerns(
const WorkspaceBundle& bundle, WorkspaceBundle bundle_thread,
const ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription&
matmul_desc,
StrategyParam strategyparam,
fallback::ConvBiasImpl::NCBKernIndex ncb_index,
size_t ohw_tile_size, StrategyBase* im2colstrategy) {
size_t OC = param.filter_meta.ocpg;
size_t output_block_size = std::min(
ohw_tile_size,
strategyparam.ohw - ncb_index.ndrange_id[2] * ohw_tile_size);
size_t output_block_oc_size = std::min(
strategyparam.oc_tile_size,
OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size);

bundle_thread.set(
static_cast<int8_t*>(
bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) +
bundle_thread.total_size_in_bytes() * ncb_index.thread_id);

fallback::MatrixMulImpl::KernParam matmul_param;
static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) =
matmul_kernsize_param;

strategyparam.batch_id = ncb_index.ndrange_id[0];
strategyparam.group_id = ncb_index.ndrange_id[1];
strategyparam.oc_cur_index =
ncb_index.ndrange_id[3] *
strategyparam.oc_tile_size;
strategyparam.oc_end_index = strategyparam.oc_cur_index +
output_block_oc_size;
strategyparam.ohw_cur_index =
ncb_index.ndrange_id[2] * ohw_tile_size;
strategyparam.output_block_oc_size = output_block_oc_size;
strategyparam.output_block_size = output_block_size;

//! 1.Im2col
im2colstrategy->exec_im2col(bundle, bundle_thread, strategyparam, param,
matmul_param, matmul_algo);

//! 2.packb and matmul compute
im2colstrategy->exec_matmul(param, strategyparam, bundle, bundle_thread,
matmul_param, matmul_algo, ncb_index,
matmul_desc);

//! 3.postprocess and copy dst if needed
im2colstrategy->exec_postprocess(param, strategyparam, bundle_thread);
}
WorkspaceBundle get_thread_bundle(
const fallback::ConvBiasImpl::NCBKernSizeParam& param,
const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param,
const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size,
size_t oc_tile_size) {
size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0],
FW = param.filter_meta.spatial[1];

size_t im2col = 0, packb = 0, matmul_dst = 0, bias_temp = 0;
bool only_packA = matmul_algo->packmode() == Pack_Mode::ONLY_PACKA;
megdnn_assert(only_packA, "only support onlypackA mode");
size_t im2col_dst_size =
IC * FH * FW * ohw_tile_size * sizeof(param.src_type);
size_t matmul_dst_size =
oc_tile_size * ohw_tile_size * sizeof(param.bias_type);
//! matmul_dst and im2col_dst use the same memory
WorkspaceBundle wb = matmul_algo->get_bundle(im2col_kern_param);
packb = wb.get_size(1);
im2col = im2col_dst_size;
matmul_dst = matmul_dst_size;
if (param.bias_mode == megdnn::BiasMode::BIAS) {
bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type);
}

return {nullptr, {packb, im2col, matmul_dst, bias_temp}};
}
};

template <>
class Im2colKerns<Pack_Mode::NO_PACK> {
public:
//! conv kernel
static void kerns(
const WorkspaceBundle& bundle, WorkspaceBundle bundle_thread,
const ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription&
matmul_desc,
StrategyParam strategyparam,
fallback::ConvBiasImpl::NCBKernIndex ncb_index,
size_t ohw_tile_size, StrategyBase* im2colstrategy) {
size_t OC = param.filter_meta.ocpg;
size_t output_block_size = std::min(
ohw_tile_size,
strategyparam.ohw - ncb_index.ndrange_id[2] * ohw_tile_size);
size_t output_block_oc_size = std::min(
strategyparam.oc_tile_size,
OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size);

strategyparam.batch_id = ncb_index.ndrange_id[0];
strategyparam.group_id = ncb_index.ndrange_id[1];
strategyparam.oc_cur_index =
ncb_index.ndrange_id[3] *
strategyparam.oc_tile_size;
strategyparam.oc_end_index = strategyparam.oc_cur_index +
output_block_oc_size;
strategyparam.ohw_cur_index =
ncb_index.ndrange_id[2] * ohw_tile_size;
strategyparam.output_block_oc_size = output_block_oc_size;
strategyparam.output_block_size = output_block_size;

bundle_thread.set(
static_cast<int8_t*>(
bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) +
bundle_thread.total_size_in_bytes() * ncb_index.thread_id);

fallback::MatrixMulImpl::KernParam matmul_param;
static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) =
matmul_kernsize_param;

//! 1.Im2col
im2colstrategy->exec_im2col(bundle, bundle_thread, strategyparam, param,
matmul_param, matmul_algo);

//! 2.packb and matmul compute
im2colstrategy->exec_matmul(param, strategyparam, bundle, bundle_thread,
matmul_param, matmul_algo, ncb_index,
matmul_desc);

//! 3.postprocess and copy dst if needed
im2colstrategy->exec_postprocess(param, strategyparam, bundle_thread);
}
WorkspaceBundle get_thread_bundle(
const fallback::ConvBiasImpl::NCBKernSizeParam& param,
const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param,
const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size,
size_t oc_tile_size) {
size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0],
FW = param.filter_meta.spatial[1];
size_t ohw = param.osz[0] * param.osz[1];

size_t im2col = 0, matmul_dst = 0, bias_temp = 0, matmul_compute = 0;
bool no_pack = matmul_algo->packmode() == Pack_Mode::NO_PACK;
megdnn_assert(no_pack, "only support no pack");
bool is_dst_8bit =
(param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
(param.src_type.enumv() == DTypeEnum::Quantized8Asymm &&
param.dst_type.enumv() == DTypeEnum::Quantized8Asymm);
size_t im2col_dst_size =
IC * FH * FW * ohw_tile_size * sizeof(param.src_type);
size_t matmul_dst_size =
oc_tile_size * ohw_tile_size * sizeof(param.bias_type);
im2col = im2col_dst_size;
if (is_dst_8bit) {
matmul_dst = matmul_dst_size;
} else {
matmul_dst = ohw_tile_size >= ohw ? 0 : matmul_dst_size;
}
matmul_compute = matmul_algo->get_workspace(im2col_kern_param);
if (param.bias_mode == megdnn::BiasMode::BIAS) {
bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type);
}

return {nullptr, {im2col, matmul_dst, bias_temp, matmul_compute}};
}
};

namespace {
static fallback::MatrixMulImpl::KernSizeParam get_matmul_kern_param(
const fallback::ConvBiasImpl::NCBKernSizeParam& param,
@@ -451,7 +180,6 @@ static WorkspaceBundle get_bundle(
MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_tile_size,
size_t ohw_tile_size) {
UNPACK_CONV_F32_NCB_KERN_SIZES(param);
MEGDNN_MARK_USED_VAR(OC);
MEGDNN_MARK_USED_VAR(OH);
MEGDNN_MARK_USED_VAR(OW);
MEGDNN_MARK_USED_VAR(FH);
@@ -506,8 +234,9 @@ size_t ConvBiasImpl::AlgoIm2col::get_workspace(
m_matmul_algo->matmul_description();
size_t oc_tile_size = 0, ohw_tile_size = 0;
choice_ohw_oc_block(p, oc_tile_size, ohw_tile_size,
matmul_desc.innerblocksize.m, matmul_desc.innerblocksize.n,
m_ohw_tile_size, matmul_desc.packmode);
matmul_desc.innerblocksize.m,
matmul_desc.innerblocksize.n, m_ohw_tile_size,
matmul_desc.packmode);
return get_bundle(p, m_matmul_algo, oc_tile_size, ohw_tile_size)
.total_size_in_bytes();
}
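For orientation: the im2col workspace is one flat allocation whose slot at Im2colBundelIndex::BUNDLE_THREAD_INDEX holds the per-thread sub-bundles back to back, and each worker binds its own slice by thread id. A sketch of that binding step, matching the bundle_thread.set arithmetic in the kern bodies of this commit:

void bind_thread_workspace(const WorkspaceBundle& bundle,
                           WorkspaceBundle& bundle_thread, size_t thread_id) {
    // slot 2 of the global bundle holds nthreads per-thread sub-bundles
    int8_t* base = static_cast<int8_t*>(
            bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX));
    bundle_thread.set(base + bundle_thread.total_size_in_bytes() * thread_id);
}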
@@ -518,20 +247,13 @@ size_t ConvBiasImpl::AlgoIm2col::get_workspace(
SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns(
const NCBKernSizeParam& param) const {
MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 1) {
UNPACK_CONV_F32_NCB_KERN_SIZES(param);
MEGDNN_MARK_USED_VAR(SH);
MEGDNN_MARK_USED_VAR(SW);
MEGDNN_MARK_USED_VAR(IH);
MEGDNN_MARK_USED_VAR(IW);
MEGDNN_MARK_USED_VAR(FH);
MEGDNN_MARK_USED_VAR(FW);
size_t oc_tile_size = 0, ohw_tile_size = 0;
size_t OH = param.osz[0];
size_t OW = param.osz[1];
size_t OC = param.filter_meta.ocpg;
size_t ohw = OH * OW;
size_t GROUP = param.filter_meta.group;
bool need_padding = (PH != 0 || PW != 0);
size_t oc_tile_size = 0, ohw_tile_size = 0;

fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc =
m_matmul_algo->matmul_description();
auto matmul_desc = m_matmul_algo->matmul_description();

bool default_pack = matmul_desc.packmode == Pack_Mode::DEFAULT;
bool no_pack = matmul_desc.packmode == Pack_Mode::NO_PACK;
@@ -542,12 +264,8 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns(
matmul_desc.innerblocksize.n, m_ohw_tile_size,
matmul_desc.packmode);

WorkspaceBundle bundle = get_bundle(param,m_matmul_algo,oc_tile_size,ohw_tile_size);
size_t ohw_parallel_times = div_ceil(ohw, ohw_tile_size);
size_t oc_parallel_times = div_ceil<size_t>(OC, oc_tile_size);
size_t packa_parallel_times = 0;
size_t pack_oc_size = pack_size(param.filter_meta.format);

if (only_packA) {
packa_parallel_times = div_ceil<size_t>(OC, oc_tile_size);
} else if (default_pack) {
@@ -558,9 +276,12 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns(
auto matmul_param = get_matmul_kern_param(
param, ohw_tile_size, default_pack ? OC : oc_tile_size);

WorkspaceBundle bundle =
get_bundle(param, m_matmul_algo, oc_tile_size, ohw_tile_size);
WorkspaceBundle bundle_thread =
get_thread_bundle(param, m_matmul_algo, matmul_param,
matmul_desc, oc_tile_size, ohw_tile_size);

StrategyParam strategyparam;
strategyparam.ohw = ohw;
strategyparam.is_dst_8bit =
@@ -578,138 +299,39 @@ SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoIm2col::dispatch_kerns(
m_matmul_algo, matmul_param, matmul_desc, packa_parallel_times);

SmallVector<ConvBiasImpl::NCBKern> ret_kern;
MIDOUT_BEGIN(
megdnn_fallback_im2col,
midout_iv("ConvBiasImpl::AlgoIm2col::dispatch_kerns"_hash)) {
StrategyBase* im2colstrategy =
Factory::get_im2col_strategy(param, m_matmul_algo);
auto kern_padding = [bundle, im2colstrategy,
pack_oc_size = pack_oc_size](
const NCBKernParam& param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
im2colstrategy->copy_padding_kern(bundle, param, ncb_index,
pack_oc_size);
};

auto kern_packA = [bundle, matmul_algo = m_matmul_algo,
matmul_param, im2colstrategy,
strategyparam = strategyparam,
matmul_desc = matmul_desc](
const NCBKernParam& param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);

im2colstrategy->packA_kern(bundle, param, matmul_param,
matmul_algo, ncb_index, matmul_desc,
strategyparam);
};
if (default_pack) {
MIDOUT_BEGIN(
megdnn_fallback_im2col,
midout_iv(
"ConvBiasImpl::AlgoIm2col::dispatch_kerns_default_pack"_hash)) {
auto kern_compute_default =
[bundle, bundle_thread, matmul_param,
matmul_algo = m_matmul_algo,
ohw_tile_size = ohw_tile_size,
strategyparam = strategyparam,
matmul_desc = matmul_desc, im2colstrategy](
const NCBKernParam& param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
Im2colKerns<Pack_Mode::DEFAULT>::kerns(
bundle, bundle_thread, param,
matmul_param, matmul_algo, matmul_desc,
strategyparam, ncb_index, ohw_tile_size,
im2colstrategy);
};
if (!enable_filter_preprocess) {
ret_kern.push_back(
{kern_packA, {GROUP, packa_parallel_times}});
}
if (need_padding) {
ret_kern.push_back(
{kern_padding,
{param.n, GROUP, IC / pack_oc_size}});
}
ret_kern.push_back({kern_compute_default,
{N, GROUP, ohw_parallel_times,
oc_parallel_times}});
return ret_kern;
}
MIDOUT_END();
return {};
} else if (only_packA) {
MIDOUT_BEGIN(
megdnn_fallback_im2col,
midout_iv(
"ConvBiasImpl::AlgoIm2col::dispatch_kerns_onlypacka"_hash)) {
auto kern_compute_onlypackA =
[bundle, bundle_thread, matmul_param,
matmul_algo = m_matmul_algo,
strategyparam = strategyparam,
ohw_tile_size = ohw_tile_size,
matmul_desc = matmul_desc, im2colstrategy](
const NCBKernParam& param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
Im2colKerns<Pack_Mode::ONLY_PACKA>::kerns(
bundle, bundle_thread, param,
matmul_param, matmul_algo, matmul_desc,
strategyparam, ncb_index, ohw_tile_size,
im2colstrategy);
};
if (!enable_filter_preprocess) {
ret_kern.push_back(
{kern_packA, {GROUP, packa_parallel_times}});
}
if (need_padding) {
ret_kern.push_back(
{kern_padding, {param.n, GROUP, IC}});
}
ret_kern.push_back({kern_compute_onlypackA,
{N, GROUP, ohw_parallel_times,
oc_parallel_times}});
return ret_kern;
}
MIDOUT_END();
return {};
} else if (no_pack) {
MIDOUT_BEGIN(
megdnn_fallback_im2col,
midout_iv(
"ConvBiasImpl::AlgoIm2col::dispatch_kerns_no_pack"_hash)) {
auto kern_compute_nopack =
[bundle, bundle_thread, matmul_param,
matmul_algo = m_matmul_algo,
strategyparam = strategyparam,
ohw_tile_size = ohw_tile_size,
matmul_desc = matmul_desc, im2colstrategy](
const NCBKernParam& param,
const NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
Im2colKerns<Pack_Mode::NO_PACK>::kerns(
bundle, bundle_thread, param,
matmul_param, matmul_algo, matmul_desc,
strategyparam, ncb_index, ohw_tile_size,
im2colstrategy);
};
if (need_padding) {
ret_kern.push_back(
{kern_padding, {param.n, GROUP, IC}});
}
ret_kern.push_back({kern_compute_nopack,
{N, GROUP, ohw_parallel_times,
oc_parallel_times}});
return ret_kern;
}
MIDOUT_END();
return {};
StrategyBase* im2colstrategy =
Factory::get_im2col_strategy(param, m_matmul_algo);
if (default_pack) {
MIDOUT_BEGIN(megdnn_fallback_im2col,
midout_iv("dispatch_kerns_default_pack"_hash)) {
return Im2colKerns<Pack_Mode::DEFAULT>().get_kerns(
param, bundle, bundle_thread, strategyparam,
matmul_param, im2colstrategy, m_matmul_algo,
ohw_tile_size, oc_tile_size, pack_oc_size);
}
MIDOUT_END();
return {};
} else if (only_packA) {
MIDOUT_BEGIN(megdnn_fallback_im2col,
midout_iv("dispatch_kerns_onlypacka"_hash)) {
return Im2colKerns<Pack_Mode::ONLY_PACKA>().get_kerns(
param, bundle, bundle_thread, strategyparam,
matmul_param, im2colstrategy, m_matmul_algo,
ohw_tile_size, oc_tile_size, pack_oc_size);
}
MIDOUT_END();
return {};
} else if (no_pack) {
MIDOUT_BEGIN(megdnn_fallback_im2col,
midout_iv("dispatch_kerns_no_pack"_hash)) {
return Im2colKerns<Pack_Mode::NO_PACK>().get_kerns(
param, bundle, bundle_thread, strategyparam,
matmul_param, im2colstrategy, m_matmul_algo,
ohw_tile_size, oc_tile_size, pack_oc_size);
}
MIDOUT_END();
return {};
}
MIDOUT_END();
return {};
}
MIDOUT_END();
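Each compute kern dispatched above runs over a 4-D grid, and kerns() in the new header decodes every NCBKernIndex back into tile coordinates. Roughly, with the names used in this diff:

// ndrange axes are {batch, group, ohw tile, oc tile}
size_t batch_id = ncb_index.ndrange_id[0];
size_t group_id = ncb_index.ndrange_id[1];
size_t ohw_cur = ncb_index.ndrange_id[2] * ohw_tile_size;  // first output pixel of this tile
size_t oc_cur = ncb_index.ndrange_id[3] * oc_tile_size;    // first output channel of this tile
// the trailing tiles may be partial, hence the std::min clamps in kerns()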
@@ -721,23 +343,38 @@ bool ConvBiasImpl::AlgoIm2col::usable(
AlgoSelectionStrategy /*algo_selection_strategy*/) const {
MIDOUT_BEGIN(megdnn_fallback_im2col, 0, 2) {
auto format = param.filter_meta.format;
auto matmul_desc = m_matmul_algo->matmul_description();
#if MEGDNN_AARCH64 || MEGDNN_ARMV7
if (format != param::ConvBias::Format::NCHW &&
format != param::ConvBias::Format::NCHW44_DOT &&
format != param::ConvBias::Format::NCHW44) {
format != param::ConvBias::Format::NCHW44 &&
format != param::ConvBias::Format::NCHW44_DOT) {
return false;
}

if(param.src_type.enumv() != param.filter_type.enumv()) {
if (format == param::ConvBias::Format::NCHW44 ||
format == param::ConvBias::Format::NCHW44_DOT) {
//! current NCHW44 im2col only supports DEFAULT-mode matmul
if (matmul_desc.packmode != Pack_Mode::DEFAULT) {
return false;
//! nchw44 hybrid mode and channel wise are not supported
} else if (param.filter_meta.icpg < 4_z ||
param.filter_meta.icpg == 1 ||
param.filter_meta.ocpg == 1) {
return false;
}
}
#else
if (format != param::ConvBias::Format::NCHW) {
return false;
}

if (param.src_type.enumv() != DTypeEnum::Int8 &&
param.src_type.enumv() != DTypeEnum::QuantizedS8 &&
param.src_type.enumv() != DTypeEnum::Quantized8Asymm &&
#endif
if (param.src_type.enumv() != param.filter_type.enumv() ||
(param.src_type.enumv() != DTypeEnum::Int8 &&
param.src_type.enumv() != DTypeEnum::QuantizedS8 &&
param.src_type.enumv() != DTypeEnum::Quantized8Asymm &&
#if !MEGDNN_DISABLE_FLOAT16
param.src_type.enumv() != DTypeEnum::Float16 &&
param.src_type.enumv() != DTypeEnum::Float16 &&
#endif
param.src_type.enumv() != DTypeEnum::Float32) {
param.src_type.enumv() != DTypeEnum::Float32)) {
return false;
}
//! make sure 8x8x16 and 8x8x32 biasmode is nobias and nonlineMode is
@@ -750,28 +387,6 @@ bool ConvBiasImpl::AlgoIm2col::usable(
return false;
}
}
fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc =
m_matmul_algo->matmul_description();
//! weight preprocess is only supported when matmul's packmode is packa or default
if (is_enable_filter_preprocess(param) &&
(matmul_desc.packmode ==
fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK)) {
return false;
}

if (format == param::ConvBias::Format::NCHW44 ||
format == param::ConvBias::Format::NCHW44_DOT) {
//! current NCHW44 im2col only supports DEFAULT-mode matmul
if (matmul_desc.packmode != Pack_Mode::DEFAULT) {
return false;
//! nchw44 hybrid mode and channel wise are not supported
} else if (param.filter_meta.icpg < 4_z ||
param.filter_meta.icpg == 1 ||
param.filter_meta.ocpg == 1) {
return false;
}
}

size_t oc_tile_size = 0, ohw_tile_size = 0;
choice_ohw_oc_block(param, oc_tile_size, ohw_tile_size,
matmul_desc.innerblocksize.m,
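Pulled out of the surrounding conditions, the NCHW44 gate that this commit hoists ahead of the dtype checks reads roughly as the following predicate (a sketch mirroring the hunk above):

bool nchw44_im2col_usable(Pack_Mode packmode, size_t icpg, size_t ocpg) {
    if (packmode != Pack_Mode::DEFAULT)
        return false;  // current NCHW44 im2col only supports DEFAULT-mode matmul
    if (icpg < 4 || icpg == 1 || ocpg == 1)
        return false;  // hybrid mode and channel-wise convolution are unsupported
    return true;
}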
@@ -798,10 +413,8 @@ bool ConvBiasImpl::AlgoIm2col::usable(
SmallVector<TensorLayout>
ConvBiasImpl::AlgoIm2col::deduce_preprocessed_filter_layout(
const NCBKernSizeParam& param) const {
MIDOUT_BEGIN(
megdnn_fallback_im2col,
midout_iv(
"ConvBiasImpl::AlgoIm2col::deduce_preprocessed_filter_layout"_hash)) {
MIDOUT_BEGIN(megdnn_fallback_im2col,
midout_iv("deduce_preprocessed_filter_layout"_hash)) {
fallback::MatrixMulImpl::AlgoBase::MatmulDescription matmul_desc =
m_matmul_algo->matmul_description();

@@ -863,8 +476,6 @@ ConvBiasImpl::AlgoIm2col::dispatch_preprocess_kerns(
packa_parallel_times =
div_ceil<size_t>(OC, matmul_desc.innerblocksize.m);
} else {
//! if nopack return null so that OprWeightPreprocessProxy can run
//! with nopack mode
return {};
}
auto matmul_param = get_matmul_kern_param(


+4 -12 dnn/src/fallback/conv_bias/im2col/factory.h

@@ -26,11 +26,10 @@ enum class StrategyType : uint32_t {
FLOAT = 0,
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
FLOAT_FP16 = 1,
#else
#endif
#if !MEGDNN_DISABLE_FLOAT16
FLOAT16_FLOAT16 = 2,
#endif
#endif
INT8x8x32 = 3,
INT8x8x16 = 4,
#if MEGDNN_AARCH64 || MEGDNN_ARMV7
@@ -153,12 +152,10 @@ public:
cb1(dt_float32, dt_float32, StrategyType::FLOAT);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
cb1(dt_float16, __fp16, StrategyType::FLOAT_FP16);
#else
#endif
#if !MEGDNN_DISABLE_FLOAT16
cb1(dt_float16, dt_float16, StrategyType::FLOAT16_FLOAT16);
#endif
#endif

cb2(dt_int8, dt_int32, dt_int32, dt_int8, dt_int32, dt_int32,
StrategyType::INT8x8x32);

@@ -256,8 +253,7 @@ public:
!param.filter_meta.should_flip) {
MIDOUT_BEGIN(
megdnn_fallback_im2col_factory_make_strategy,
midout_iv(
"DefaultStrategyType::8x12x1_fuse_packb_s2_nchw44"_hash)) {
midout_iv("8x12x1_fuse_packb_s2_nchw44"_hash)) {
return std::make_unique<
StrategyFuseXx12x1Nchw44K3x3S2<
float, float,
@@ -284,7 +280,7 @@ public:
cb1(NCHW, DEFAULT, dt_float16, __fp16, PostprocessMode::FLOAT,
"DefaultStrategyType::FLOAT_FP16"_hash);
break;
#else
#endif
#if !MEGDNN_DISABLE_FLOAT16
case StrategyType::FLOAT16_FLOAT16:
cb1(NCHW, DEFAULT, dt_float16, dt_float16,
@@ -292,7 +288,6 @@ public:
"DefaultStrategyType::FLOAT16_FLOAT16"_hash);
break;
#endif
#endif
case StrategyType::INT8x8x32:
if (format == param::ConvBias::Format::NCHW) {
cb3(NCHW, DEFAULT, dt_int8, dt_int32, dt_int32, dt_int8,
@@ -472,8 +467,6 @@ public:
cb1(NCHW, NO_PACK, dt_float32, dt_float32,
PostprocessMode::FLOAT, "NoPackStrategyType::FLOAT"_hash);
break;
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#else
#if !MEGDNN_DISABLE_FLOAT16
case StrategyType::FLOAT16_FLOAT16:
cb1(NCHW, NO_PACK, dt_float16, dt_float16,
@@ -481,7 +474,6 @@ public:
"NoPackStrategyType::FLOAT16_FLOAT16"_hash);
break;
#endif
#endif
case StrategyType::INT8x8x16:
cb3(NCHW, NO_PACK, dt_int8, dt_int16, dt_int16, dt_int8,
dt_int16, dt_int16, PostprocessMode::ADD_BIAS,
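The preprocessor change repeated in this file (and in the strategy_*.cpp diffs below) fixes a guard-nesting bug: the generic dt_float16 strategies used to sit in the #else branch of the __ARM_FEATURE_FP16_VECTOR_ARITHMETIC guard, so they disappeared whenever the __fp16 path was compiled in. The fix gates each variant independently, schematically:

// before: generic fp16 built only when vector fp16 is absent
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/* __fp16 strategy */
#else
#if !MEGDNN_DISABLE_FLOAT16
/* generic dt_float16 strategy */
#endif
#endif

// after: each strategy gated on its own condition
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/* __fp16 strategy */
#endif
#if !MEGDNN_DISABLE_FLOAT16
/* generic dt_float16 strategy */
#endif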


+364 -0 dnn/src/fallback/conv_bias/im2col/im2col_kerns.h

@@ -0,0 +1,364 @@
/**
* \file dnn/src/fallback/conv_bias/im2col/im2col_kerns.h
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
*
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/

#include "src/fallback/conv_bias/opr_impl.h"
#include "src/naive/convolution/helper.h"
#include "src/fallback/conv_bias/im2col/factory.h"

#include "midout.h"

MIDOUT_DECL(megdnn_fallback_im2col)

namespace megdnn {
namespace fallback {
namespace im2col {

/*!
 * \brief The index of each part of the im2col workspace bundle,
 * through which the needed pointers can be conveniently fetched
 */
struct Im2colBundelIndex {
static constexpr size_t BUNDLE_THREAD_INDEX = 2_z;
};

using Pack_Mode = fallback::MatrixMulImpl::AlgoBase::PackMode;
/*!
 * \brief Im2colKerns collects all the im2col kerns in it
 */
namespace {
//! conv kernel
static void kerns(
const WorkspaceBundle& bundle, WorkspaceBundle bundle_thread,
const ConvBiasImpl::NCBKernParam& param,
fallback::MatrixMulImpl::KernSizeParam matmul_kernsize_param,
const fallback::MatrixMulImpl::AlgoBase* matmul_algo,
const fallback::MatrixMulImpl::AlgoBase::MatmulDescription& matmul_desc,
StrategyParam strategyparam,
fallback::ConvBiasImpl::NCBKernIndex ncb_index, size_t ohw_tile_size,
StrategyBase* im2colstrategy) {
size_t OC = param.filter_meta.ocpg;
size_t output_block_size = std::min(
ohw_tile_size,
strategyparam.ohw - ncb_index.ndrange_id[2] * ohw_tile_size);
size_t output_block_oc_size =
std::min(strategyparam.oc_tile_size,
OC - ncb_index.ndrange_id[3] * strategyparam.oc_tile_size);

bundle_thread.set(
static_cast<int8_t*>(
bundle.get(Im2colBundelIndex::BUNDLE_THREAD_INDEX)) +
bundle_thread.total_size_in_bytes() * ncb_index.thread_id);

fallback::MatrixMulImpl::KernParam matmul_param;
static_cast<fallback::MatrixMulImpl::KernSizeParam&>(matmul_param) =
matmul_kernsize_param;

strategyparam.batch_id = ncb_index.ndrange_id[0];
strategyparam.group_id = ncb_index.ndrange_id[1];
strategyparam.oc_cur_index =
ncb_index.ndrange_id[3] * strategyparam.oc_tile_size;
strategyparam.oc_end_index =
strategyparam.oc_cur_index + output_block_oc_size;
strategyparam.ohw_cur_index = ncb_index.ndrange_id[2] * ohw_tile_size;
strategyparam.output_block_oc_size = output_block_oc_size;
strategyparam.output_block_size = output_block_size;

//! 1.Im2col
im2colstrategy->exec_im2col(bundle, bundle_thread, strategyparam, param,
matmul_param, matmul_algo);

//! 2.packb and matmul compute
im2colstrategy->exec_matmul(param, strategyparam, bundle, bundle_thread,
matmul_param, matmul_algo, ncb_index,
matmul_desc);

//! 3.postprocess and copy dst if needed
im2colstrategy->exec_postprocess(param, strategyparam, bundle_thread);
}
} // namespace

template <Pack_Mode packmode>
class Im2colKerns;

template <>
class Im2colKerns<Pack_Mode::DEFAULT> {
public:
SmallVector<ConvBiasImpl::NCBKern> get_kerns(
const ConvBiasImpl::NCBKernSizeParam& param,
WorkspaceBundle& bundle, WorkspaceBundle& bundle_thread,
const StrategyParam& strategyparam,
fallback::MatrixMulImpl::KernSizeParam& matmul_param,
StrategyBase* im2colstrategy, MatrixMulImpl::AlgoBase* matmul_algo,
size_t ohw_tile_size, size_t oc_tile_size, size_t pack_oc_size) {
auto matmul_desc = matmul_algo->matmul_description();
auto kern_padding =
[bundle, im2colstrategy, pack_oc_size = pack_oc_size](
const ConvBiasImpl::NCBKernParam& param,
const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
im2colstrategy->copy_padding_kern(bundle, param, ncb_index,
pack_oc_size);
};

auto kern_packA =
[bundle, matmul_algo, matmul_param, im2colstrategy,
strategyparam = strategyparam, matmul_desc = matmul_desc](
const ConvBiasImpl::NCBKernParam& param,
const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
im2colstrategy->packA_kern(bundle, param, matmul_param,
matmul_algo, ncb_index,
matmul_desc, strategyparam);
};
auto kern_compute_default =
[bundle, bundle_thread, matmul_param, matmul_algo,
ohw_tile_size, strategyparam, matmul_desc = matmul_desc,
im2colstrategy](
const ConvBiasImpl::NCBKernParam& param,
const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
kerns(bundle, bundle_thread, param, matmul_param,
matmul_algo, matmul_desc, strategyparam, ncb_index,
ohw_tile_size, im2colstrategy);
};
size_t OH = param.osz[0];
size_t OW = param.osz[1];
size_t BATCH = param.n;
size_t OC = param.filter_meta.ocpg;
size_t IC = param.filter_meta.icpg;
size_t PH = param.filter_meta.padding[0];
size_t PW = param.filter_meta.padding[1];
size_t GROUP = param.filter_meta.group;
size_t packa_parallel_times =
div_ceil<size_t>(OC, matmul_desc.innerblocksize.m);
size_t ohw_parallel_times = div_ceil(OH * OW, ohw_tile_size);
size_t oc_parallel_times = div_ceil<size_t>(OC, oc_tile_size);
SmallVector<ConvBiasImpl::NCBKern> ret_kern;
if (!is_enable_filter_preprocess(param)) {
ret_kern.push_back({kern_packA, {GROUP, packa_parallel_times}});
}
if (PH != 0 || PW != 0) {
ret_kern.push_back(
{kern_padding, {BATCH, GROUP, IC / pack_oc_size}});
}
ret_kern.push_back(
{kern_compute_default,
{BATCH, GROUP, ohw_parallel_times, oc_parallel_times}});
return ret_kern;
}

WorkspaceBundle get_thread_bundle(
const fallback::ConvBiasImpl::NCBKernSizeParam& param,
const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param,
const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size,
size_t oc_tile_size) {
size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0],
FW = param.filter_meta.spatial[1];
size_t pack_oc_size = pack_size(param.filter_meta.format);
size_t im2col = 0, packb = 0, bias_temp = 0;
bool default_pack = matmul_algo->packmode() == Pack_Mode::DEFAULT;
megdnn_assert(default_pack, "only support default packa");
size_t im2col_dst_size =
IC * FH * FW * ohw_tile_size * sizeof(param.src_type);
size_t matmul_dst_size = pack_oc_size * oc_tile_size * ohw_tile_size *
sizeof(param.bias_type);
//! matmul_dst and im2col_dst use the same memory
WorkspaceBundle wb = matmul_algo->get_bundle(im2col_kern_param);
packb = wb.get_size(1);
im2col = std::max(im2col_dst_size, matmul_dst_size);
if (param.bias_mode == megdnn::BiasMode::BIAS) {
bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type);
}
return {nullptr, {packb, im2col, bias_temp}};
}
};
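// Note: the "same memory" comment above means this bundle reserves a single
// region sized max(im2col_dst_size, matmul_dst_size) for both buffers. A
// worked example with hypothetical fp32 NCHW44 tile sizes:
//   IC = 16, FH = FW = 3, ohw_tile_size = 192, oc_tile_size = 24,
//   pack_oc_size = 4
//   im2col_dst = 16 * 3 * 3 * 192 * sizeof(float) = 110592 bytes
//   matmul_dst = 4 * 24 * 192 * sizeof(float)     =  73728 bytes
//   shared slot = max(110592, 73728)              = 110592 bytes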

template <>
class Im2colKerns<Pack_Mode::ONLY_PACKA> {
public:
SmallVector<ConvBiasImpl::NCBKern> get_kerns(
const ConvBiasImpl::NCBKernSizeParam& param,
WorkspaceBundle& bundle, WorkspaceBundle& bundle_thread,
const StrategyParam& strategyparam,
fallback::MatrixMulImpl::KernSizeParam& matmul_param,
StrategyBase* im2colstrategy, MatrixMulImpl::AlgoBase* matmul_algo,
size_t ohw_tile_size, size_t oc_tile_size, size_t pack_oc_size) {
auto matmul_desc = matmul_algo->matmul_description();
auto kern_padding =
[bundle, im2colstrategy, pack_oc_size = pack_oc_size](
const ConvBiasImpl::NCBKernParam& param,
const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
im2colstrategy->copy_padding_kern(bundle, param, ncb_index,
pack_oc_size);
};

auto kern_packA =
[bundle, matmul_algo, matmul_param, im2colstrategy,
strategyparam = strategyparam, matmul_desc = matmul_desc](
const ConvBiasImpl::NCBKernParam& param,
const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
im2colstrategy->packA_kern(bundle, param, matmul_param,
matmul_algo, ncb_index,
matmul_desc, strategyparam);
};
auto kern_compute_onlypackA =
[bundle, bundle_thread, matmul_param, matmul_algo,
strategyparam, ohw_tile_size, matmul_desc, im2colstrategy](
const ConvBiasImpl::NCBKernParam& param,
const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
kerns(bundle, bundle_thread, param, matmul_param,
matmul_algo, matmul_desc, strategyparam, ncb_index,
ohw_tile_size, im2colstrategy);
};
size_t OH = param.osz[0];
size_t OW = param.osz[1];
size_t BATCH = param.n;
size_t OC = param.filter_meta.ocpg;
size_t IC = param.filter_meta.icpg;
size_t PH = param.filter_meta.padding[0];
size_t PW = param.filter_meta.padding[1];
size_t GROUP = param.filter_meta.group;
size_t ohw_parallel_times = div_ceil(OH * OW, ohw_tile_size);
size_t oc_parallel_times = div_ceil<size_t>(OC, oc_tile_size);
SmallVector<ConvBiasImpl::NCBKern> ret_kern;
if (!is_enable_filter_preprocess(param)) {
ret_kern.push_back({kern_packA, {GROUP, oc_parallel_times}});
}
if (PH != 0 || PW != 0) {
ret_kern.push_back(
{kern_padding, {BATCH, GROUP, IC / pack_oc_size}});
}
ret_kern.push_back(
{kern_compute_onlypackA,
{BATCH, GROUP, ohw_parallel_times, oc_parallel_times}});
return ret_kern;
}
WorkspaceBundle get_thread_bundle(
const fallback::ConvBiasImpl::NCBKernSizeParam& param,
const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param,
const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size,
size_t oc_tile_size) {
size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0],
FW = param.filter_meta.spatial[1];

size_t im2col = 0, packb = 0, matmul_dst = 0, bias_temp = 0;
bool only_packA = matmul_algo->packmode() == Pack_Mode::ONLY_PACKA;
megdnn_assert(only_packA, "only support onlypackA mode");
size_t im2col_dst_size =
IC * FH * FW * ohw_tile_size * sizeof(param.src_type);
size_t matmul_dst_size =
oc_tile_size * ohw_tile_size * sizeof(param.bias_type);
//! matmul_dst and im2col_dst use the same memory
WorkspaceBundle wb = matmul_algo->get_bundle(im2col_kern_param);
packb = wb.get_size(1);
im2col = im2col_dst_size;
matmul_dst = matmul_dst_size;
if (param.bias_mode == megdnn::BiasMode::BIAS) {
bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type);
}

return {nullptr, {packb, im2col, matmul_dst, bias_temp}};
}
};

template <>
class Im2colKerns<Pack_Mode::NO_PACK> {
public:
SmallVector<ConvBiasImpl::NCBKern> get_kerns(
const ConvBiasImpl::NCBKernSizeParam& param,
WorkspaceBundle& bundle, WorkspaceBundle& bundle_thread,
const StrategyParam& strategyparam,
fallback::MatrixMulImpl::KernSizeParam& matmul_param,
StrategyBase* im2colstrategy, MatrixMulImpl::AlgoBase* matmul_algo,
size_t ohw_tile_size, size_t oc_tile_size, size_t pack_oc_size) {
auto matmul_desc = matmul_algo->matmul_description();
auto kern_padding =
[bundle, im2colstrategy, pack_oc_size = pack_oc_size](
const ConvBiasImpl::NCBKernParam& param,
const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
im2colstrategy->copy_padding_kern(bundle, param, ncb_index,
pack_oc_size);
};
auto kern_compute_nopack =
[bundle, bundle_thread, matmul_param, matmul_algo,
strategyparam, ohw_tile_size, matmul_desc, im2colstrategy](
const ConvBiasImpl::NCBKernParam& param,
const ConvBiasImpl::NCBKernIndex& ncb_index) mutable {
bundle.set(param.workspace_ptr);
kerns(bundle, bundle_thread, param, matmul_param,
matmul_algo, matmul_desc, strategyparam, ncb_index,
ohw_tile_size, im2colstrategy);
};
size_t OH = param.osz[0];
size_t OW = param.osz[1];
size_t BATCH = param.n;
size_t OC = param.filter_meta.ocpg;
size_t IC = param.filter_meta.icpg;
size_t PH = param.filter_meta.padding[0];
size_t PW = param.filter_meta.padding[1];
size_t GROUP = param.filter_meta.group;
size_t ohw_parallel_times = div_ceil(OH * OW, ohw_tile_size);
size_t oc_parallel_times = div_ceil<size_t>(OC, oc_tile_size);
SmallVector<ConvBiasImpl::NCBKern> ret_kern;
if (PH != 0 || PW != 0) {
ret_kern.push_back(
{kern_padding, {BATCH, GROUP, IC / pack_oc_size}});
}
ret_kern.push_back(
{kern_compute_nopack,
{BATCH, GROUP, ohw_parallel_times, oc_parallel_times}});
return ret_kern;
}
WorkspaceBundle get_thread_bundle(
const fallback::ConvBiasImpl::NCBKernSizeParam& param,
const fallback::MatrixMulImpl::KernSizeParam& im2col_kern_param,
const MatrixMulImpl::AlgoBase* matmul_algo, size_t ohw_tile_size,
size_t oc_tile_size) {
size_t IC = param.filter_meta.icpg, FH = param.filter_meta.spatial[0],
FW = param.filter_meta.spatial[1];
size_t ohw = param.osz[0] * param.osz[1];

size_t im2col = 0, matmul_dst = 0, bias_temp = 0, matmul_compute = 0;
bool no_pack = matmul_algo->packmode() == Pack_Mode::NO_PACK;
megdnn_assert(no_pack, "only support no pack");
bool is_dst_8bit =
(param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
(param.src_type.enumv() == DTypeEnum::Quantized8Asymm &&
param.dst_type.enumv() == DTypeEnum::Quantized8Asymm);
size_t im2col_dst_size =
IC * FH * FW * ohw_tile_size * sizeof(param.src_type);
size_t matmul_dst_size =
oc_tile_size * ohw_tile_size * sizeof(param.bias_type);
im2col = im2col_dst_size;
if (is_dst_8bit) {
matmul_dst = matmul_dst_size;
} else {
matmul_dst = ohw_tile_size >= ohw ? 0 : matmul_dst_size;
}
matmul_compute = matmul_algo->get_workspace(im2col_kern_param);
if (param.bias_mode == megdnn::BiasMode::BIAS) {
bias_temp = oc_tile_size * ohw_tile_size * sizeof(param.bias_type);
}

return {nullptr, {im2col, matmul_dst, bias_temp, matmul_compute}};
}
};

} // namespace im2col
} // namespace fallback
} // namespace megdnn

// vim: syntax=cpp.doxygen
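With the three specializations collected in this header, algos.cpp only has to pick one and forward, as the dispatch_kerns diff above shows. A sketch of the call shape, with arguments named as in this commit:

auto kerns = Im2colKerns<Pack_Mode::DEFAULT>().get_kerns(
        param, bundle, bundle_thread, strategyparam, matmul_param,
        im2colstrategy, m_matmul_algo, ohw_tile_size, oc_tile_size,
        pack_oc_size);
// The returned list holds, in order: the packA kern over
// {GROUP, packa_parallel_times} (skipped when filter preprocessing is
// enabled), the padding kern over {BATCH, GROUP, IC / pack_oc_size}
// (skipped when PH == PW == 0), and the compute kern over
// {BATCH, GROUP, ohw_parallel_times, oc_parallel_times}.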

+1 -2 dnn/src/fallback/conv_bias/im2col/strategy_default.cpp

@@ -192,12 +192,11 @@ INSTANTIAL_CLASS(dt_float32, dt_float32, dt_float32, dt_float32, dt_float32,
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, __fp16, __fp16,
megdnn::PostprocessMode::FLOAT)
#else
#endif
#if !MEGDNN_DISABLE_FLOAT16
INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, dt_float16, dt_float16,
megdnn::PostprocessMode::NO_PROCESS)
#endif
#endif

#if MEGDNN_AARCH64 || MEGDNN_ARMV7
//! x86 does not have uint8 matmul, so only armv7/armv8 support uint8


+1 -2 dnn/src/fallback/conv_bias/im2col/strategy_default_nchw44.cpp

@@ -108,13 +108,12 @@ INSTANTIAL_CLASS(dt_float32, dt_float32, dt_float32, dt_float32, dt_float32,
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, __fp16, __fp16,
megdnn::PostprocessMode::FLOAT)
#else
#endif
#if !MEGDNN_DISABLE_FLOAT16
INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, dt_float16, dt_float16,
megdnn::PostprocessMode::NO_PROCESS)
#endif

#endif
#if MEGDNN_AARCH64 || MEGDNN_ARMV7
//! x86 does not have uint8 matmul, so only armv7/armv8 support uint8
INSTANTIAL_CLASS(dt_uint8, dt_int32, dt_uint8, dt_qint32, dt_quint8,


+0 -3 dnn/src/fallback/conv_bias/im2col/strategy_nopack.cpp

@@ -165,13 +165,10 @@ INSTANTIAL_CLASS(dt_int8, dt_int16, dt_int16, dt_int16, dt_int16,
megdnn::PostprocessMode::ADD_BIAS)
INSTANTIAL_CLASS(dt_int8, dt_int32, dt_int32, dt_int32, dt_int32,
megdnn::PostprocessMode::ADD_BIAS)
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
#else
#if !MEGDNN_DISABLE_FLOAT16
INSTANTIAL_CLASS(dt_float16, dt_float16, dt_float16, dt_float16, dt_float16,
megdnn::PostprocessMode::NO_PROCESS)
#endif
#endif
#undef INSTANTIAL_CLASS
} // namespace megdnn


