@@ -0,0 +1,230 @@ | |||||
/** | |||||
* \file dnn/src/fallback/conv_bias/conv1x1/algos.cpp | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/ | |||||
#include "src/fallback/conv_bias/conv1x1/algos.h" | |||||
#include "src/common/opr_delegate.h" | |||||
#include "src/fallback/conv_bias/common.h" | |||||
#include "src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h" | |||||
#include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h" | |||||
#include "src/fallback/conv_bias/opr_impl.h" | |||||
#include "megdnn/opr_param_defs.h" | |||||
#include "src/naive/convolution/helper.h" | |||||
#if MEGDNN_X86 | |||||
#include "src/x86/conv_bias/postprocess_helper.h" | |||||
#endif | |||||
#include "midout.h" | |||||
MIDOUT_DECL(megdnn_fallback_conv1x1) | |||||
using namespace megdnn; | |||||
using namespace fallback; | |||||
#if MEGDNN_X86 | |||||
using namespace x86; | |||||
#endif | |||||
using namespace conv1x1; | |||||
size_t ConvBiasImpl::AlgoConv1x1::get_oc_tile_size_heuristic(
        const NCBKernSizeParam& param) const {
    //! Pick the OC tile size for one pack/compute task.
    //! For a large spatial output or many output channels, use the
    //! configured block size; otherwise spread the (few) output channels
    //! evenly across the worker threads.
    const size_t out_pixels = param.osz[0] * param.osz[1];
    const size_t oc = param.filter_meta.ocpg;
    const bool big_problem = out_pixels >= 56 * 56 || oc >= 64;
    if (big_problem) {
        return m_oc_block_size;
    }
    return div_ceil(oc, param.nr_threads);
}
//! Total workspace in bytes required by this algorithm for \p param.
//! The layout is computed by the Conv1x1Kerns dispatcher matching the
//! matmul algorithm's pack mode; each branch is wrapped in MIDOUT markers
//! so unused instantiations can be stripped at compile time.
size_t ConvBiasImpl::AlgoConv1x1::get_workspace(
        ConvBiasImpl*, const NCBKernSizeParam& param) const {
    size_t OH = param.osz[0];
    size_t OW = param.osz[1];
    size_t compt_oc_block_size = get_oc_tile_size_heuristic(param);
    //! matmul shape: M = oc tile size, K = IC, N = OH * OW
    auto matmul_param =
            get_matmul_kern_param(param, OH * OW, compt_oc_block_size);
    auto pack_mode = m_matmul_algo->packmode();
    if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 0) {
            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT> dispatcher;
            return dispatcher
                    .get_bundle(param, matmul_param, m_matmul_algo,
                                compt_oc_block_size)
                    .total_size_in_bytes();
        }
        MIDOUT_END();
    } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 1) {
            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> dispatcher;
            return dispatcher
                    .get_bundle(param, matmul_param, m_matmul_algo,
                                compt_oc_block_size)
                    .total_size_in_bytes();
        }
        MIDOUT_END();
    } else {
        //! remaining pack mode: NO_PACK
        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 0, 2) {
            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK> dispatcher;
            return dispatcher
                    .get_bundle(param, matmul_param, m_matmul_algo,
                                compt_oc_block_size)
                    .total_size_in_bytes();
        }
        MIDOUT_END();
    }
    //! only reached when a MIDOUT branch above is compiled out
    return 0;
}
//! Build the kernel list executed by the naive-conv-bias (NCB) runner:
//!   1. packA over {GROUP, oc_tiles_per_group} (DEFAULT / ONLY_PACKA modes)
//!   2. packB once (DEFAULT mode only)
//!   3. compute over {BATCH, GROUP, oc_tiles_per_group}
//! Workspace bundles are computed here (sizes only) and re-bound to the
//! runtime workspace pointer inside each kernel.
SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoConv1x1::dispatch_kerns(
        ConvBiasImpl* opr, const NCBKernSizeParam& param) const {
    SmallVector<ConvBiasImpl::NCBKern> ret_kern;
    size_t OH = param.osz[0];
    size_t OW = param.osz[1];
    size_t OC = param.filter_meta.ocpg;
    size_t compt_oc_block_size = get_oc_tile_size_heuristic(param);
    size_t GROUP = param.filter_meta.group;
    size_t BATCH = param.n;
    size_t oc_blocks_per_group = div_ceil(OC, compt_oc_block_size);
    //! matmul shape: M = oc tile size, K = IC, N = OH * OW
    auto matmul_param =
            get_matmul_kern_param(param, OH * OW, compt_oc_block_size);
    WorkspaceBundle whole_bundle = {nullptr, {}};
    WorkspaceBundle thread_bundle = {nullptr, {}};
    WorkspaceBundle matmul_bundle = {nullptr, {}};
    auto pack_mode = m_matmul_algo->packmode();
    if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 0) {
            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::DEFAULT> dispatcher;
            whole_bundle = dispatcher.get_bundle(
                    param, matmul_param, m_matmul_algo, compt_oc_block_size);
            matmul_bundle = m_matmul_algo->get_bundle(matmul_param);
        }
        MIDOUT_END();
    } else if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 1) {
            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA> dispatcher;
            whole_bundle = dispatcher.get_bundle(
                    param, matmul_param, m_matmul_algo, compt_oc_block_size);
            matmul_bundle = m_matmul_algo->get_bundle(matmul_param);
        }
        MIDOUT_END();
    } else {
        MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 1, 2) {
            Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK> dispatcher;
            whole_bundle = dispatcher.get_bundle(
                    param, matmul_param, m_matmul_algo, compt_oc_block_size);
            //! no packing in this mode: only the plain matmul workspace,
            //! stored in slot 2 to mirror the packed-bundle layout
            matmul_bundle = {
                    nullptr,
                    {0, 0, m_matmul_algo->get_workspace(matmul_param)}};
        }
        MIDOUT_END();
    }
    //! get thread bundle
    thread_bundle = get_thread_bundle(param, matmul_bundle.get_size(2),
                                      compt_oc_block_size);
    //! raw pointer into the factory's process-wide strategy cache; not owned
    Conv1x1StrategyBase* conv1x1_strategy =
            Conv1x1Factory::make_conv1x1_strategy(param, pack_mode,
                                                  opr->param().format);
    //! the lambdas capture bundles/params by value (mutable) so each kernel
    //! is self-contained when executed on worker threads
    auto kern_packA = [this, whole_bundle, matmul_bundle, param,
                       compt_oc_block_size, conv1x1_strategy](
                              const NCBKernParam& ncb_param,
                              const NCBKernIndex& ncb_index) mutable {
        conv1x1_strategy->packA(whole_bundle, matmul_bundle,
                                compt_oc_block_size, this->m_matmul_algo, param,
                                ncb_param, std::move(ncb_index));
    };
    auto kern_packB = [this, whole_bundle, matmul_bundle, param,
                       conv1x1_strategy](
                              const NCBKernParam& ncb_param,
                              const NCBKernIndex& ncb_index) mutable {
        conv1x1_strategy->packB(whole_bundle, matmul_bundle,
                                this->m_matmul_algo, param, ncb_param,
                                std::move(ncb_index));
    };
    auto kern_compt = [this, whole_bundle, matmul_bundle, thread_bundle, param,
                       compt_oc_block_size, conv1x1_strategy](
                              const NCBKernParam& ncb_param,
                              const NCBKernIndex& ncb_index) mutable {
        conv1x1_strategy->exec(whole_bundle, matmul_bundle, thread_bundle,
                               compt_oc_block_size, this->m_matmul_algo, param,
                               ncb_param, std::move(ncb_index));
    };
    if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT ||
        pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA) {
        ret_kern.push_back({kern_packA, {GROUP, oc_blocks_per_group}});
        if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
            //! the B (src) panel is shared by all groups/tiles: pack it once
            ret_kern.push_back({kern_packB, {1}});
        }
    }
    ret_kern.push_back({kern_compt, {BATCH, GROUP, oc_blocks_per_group}});
    return ret_kern;
}
//! Whether this algorithm can handle \p param: NCHW only, 1x1 filter with
//! unit stride, no padding, no dilation, DEFAULT compute mode, and the
//! underlying matmul algorithm must accept the derived matmul shape.
bool ConvBiasImpl::AlgoConv1x1::usable(ConvBiasImpl* opr,
                                       const NCBKernSizeParam& param,
                                       AlgoSelectionStrategy) const {
    MIDOUT_BEGIN(megdnn_fallback_conv1x1, 0, 2) {
        //! only support nchw format
        if (opr->param().format != param::ConvBias::Format::NCHW)
            return false;
        size_t FH = param.filter_meta.spatial[0],
               FW = param.filter_meta.spatial[1];
        size_t PH = param.filter_meta.padding[0],
               PW = param.filter_meta.padding[1];
        size_t SH = param.filter_meta.stride[0],
               SW = param.filter_meta.stride[1];
        if (FH != 1 || FW != 1 || PH || PW || SH != 1 || SW != 1)
            return false;
        //! make sure 8x8x16 and 8x8x32 biasmode is nobias and nonlineMode
        //! is identity otherwise return false mean that 8x8x32 and 8x8x16
        //! not support PostProcess.
        //! BUGFIX: both requirements must hold, so reject as soon as either
        //! one is violated -- the rejection condition is an OR (De Morgan),
        //! not the AND the original code used, which wrongly accepted e.g.
        //! bias + IDENTITY and silently dropped the bias.
        if (param.src_type.enumv() == param.filter_type.enumv() &&
            (param.src_type.enumv() == DTypeEnum::Int8 &&
             (param.dst_type.enumv() == DTypeEnum::Int16 ||
              param.dst_type.enumv() == DTypeEnum::Int32)) &&
            (param.bias_mode != megdnn::BiasMode::NO_BIAS ||
             param.nonlineMode != megdnn::NonlineMode::IDENTITY))
            return false;
        //! same rule for the quantized 8x8x32 paths
        if (param.src_type.enumv() == param.filter_type.enumv() &&
            ((param.src_type.enumv() == DTypeEnum::QuantizedS8 ||
              param.src_type.enumv() == DTypeEnum::Quantized8Asymm) &&
             param.dst_type.enumv() == DTypeEnum::QuantizedS32) &&
            (param.bias_mode != megdnn::BiasMode::NO_BIAS ||
             param.nonlineMode != megdnn::NonlineMode::IDENTITY))
            return false;
        size_t OH = param.osz[0];
        size_t OW = param.osz[1];
        MatrixMulImpl::KernSizeParam matmul_param = get_matmul_kern_param(
                param, OH * OW, get_oc_tile_size_heuristic(param));
        bool matmulusable = m_matmul_algo->usable(matmul_param);
        return matmulusable &&
               (param.filter_meta.dilation[0] ==
                        param.filter_meta.dilation[1] &&
                param.filter_meta.dilation[0] == 1) &&
               param.compute_mode == param::ConvBias::ComputeMode::DEFAULT;
    }
    MIDOUT_END();
    return false;
}
@@ -0,0 +1,56 @@ | |||||
/** | |||||
* \file dnn/src/fallback/conv_bias/conv1x1/algos.h | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/ | |||||
#pragma once | |||||
#include "megdnn/thin/small_vector.h" | |||||
#include "src/common/utils.h" | |||||
#include "src/fallback/conv_bias/opr_impl.h" | |||||
#include "src/fallback/matrix_mul/opr_impl.h" | |||||
namespace megdnn { | |||||
namespace fallback { | |||||
//! Implement a 1x1 convolution (stride 1, no padding) by delegating to a
//! fallback matrix-multiply algorithm; output channels are processed in
//! tiles of (at most) m_oc_block_size rows.
class ConvBiasImpl::AlgoConv1x1 final : public AlgoBase {
public:
    //! \param matmul_algo  matmul backend used for the GEMM; not owned
    //! \param oc_block_size  preferred OC tile size (see
    //!        get_oc_tile_size_heuristic for when it is overridden)
    AlgoConv1x1(MatrixMulImpl::AlgoBase* matmul_algo, size_t oc_block_size)
            : m_matmul_algo(matmul_algo), m_oc_block_size(oc_block_size) {}
    bool is_reproducible() const override { return true; }
    //! Lazily-built display name "CONV1x1:<matmul>:<oc_block>";
    //! m_name is mutable so this const accessor can cache it.
    const char* name() const override {
        if (m_name.empty()) {
            m_name = ssprintf("CONV1x1:%s:%zu", m_matmul_algo->name(),
                              m_oc_block_size);
        }
        return m_name.c_str();
    }
    bool usable(ConvBiasImpl* opr, const NCBKernSizeParam& param,
                AlgoSelectionStrategy algo_selection_strategy) const override;
    size_t get_workspace(ConvBiasImpl*,
                         const NCBKernSizeParam& param) const override;
    SmallVector<NCBKern> dispatch_kerns(
            ConvBiasImpl* opr, const NCBKernSizeParam& param) const override;
protected:
    //! OC tile size actually used for \p param (may be smaller than
    //! m_oc_block_size for small problems)
    size_t get_oc_tile_size_heuristic(const NCBKernSizeParam& param) const;
private:
    MatrixMulImpl::AlgoBase* m_matmul_algo;
    mutable std::string m_name;           //! cache for name()
    mutable size_t m_oc_block_size = 0;   //! configured OC tile size
};
} // namespace fallback | |||||
} // namespace megdnn | |||||
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,99 @@ | |||||
/** | |||||
* \file dnn/src/fallback/conv_bias/conv1x1/conv1x1_dispatcher.h | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/ | |||||
#pragma once | |||||
#include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h" | |||||
namespace megdnn { | |||||
namespace fallback { | |||||
namespace conv1x1 { | |||||
namespace { | |||||
//! get_thread_bundle | |||||
//! Per-thread workspace: slot 0 is the matmul scratch, slot 1 is a staging
//! buffer for the matmul result when the destination is 8-bit (the 8-bit
//! dst cannot hold the wide accumulator directly).
WorkspaceBundle get_thread_bundle(const ConvBiasImpl::NCBKernSizeParam& param,
                                  size_t matmul_c_size, size_t oc_tile_size) {
    const size_t out_pixels = param.osz[0] * param.osz[1];
    const bool dst_is_8bit =
            (param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
             param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
            (param.src_type.enumv() == DTypeEnum::Quantized8Asymm &&
             param.dst_type.enumv() == DTypeEnum::Quantized8Asymm);
    //! NOTE(review): sizeof(param.bias_type) is the size of the DType
    //! handle object, not the element size -- presumably a safe upper
    //! bound; confirm against param.bias_type.size().
    size_t staging_bytes = 0;
    if (dst_is_8bit) {
        staging_bytes = oc_tile_size * out_pixels * sizeof(param.bias_type);
    }
    return WorkspaceBundle{nullptr, {matmul_c_size, staging_bytes}};
}
} // anonymous namespace | |||||
//! Workspace-layout helper for the packed matmul modes (DEFAULT and
//! ONLY_PACKA). Bundle slots: 0 = all packed-A panels, 1 = all packed-B
//! panels (DEFAULT only), 2 = per-thread scratch replicated nr_threads
//! times.
template <MatrixMulImpl::AlgoBase::PackMode pack_mode>
class Conv1x1Kerns {
public:
    //! get_bundle
    WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param,
                               const MatrixMulImpl::KernSizeParam& matmul_param,
                               const MatrixMulImpl::AlgoBase* matmul_algo,
                               size_t oc_tile_size) {
        size_t GROUP = param.filter_meta.group;
        size_t OC = param.filter_meta.ocpg;
        size_t BATCH = param.n;
        //! bundle per thread
        //! matmul_param records a matmul with M = oc_tile_size, K = IC, N = OH
        //! * OW this does not bother packb bytes
        auto matmul_bundle = matmul_algo->get_bundle(matmul_param);
        auto thread_bundle = get_thread_bundle(param, matmul_bundle.get_size(2),
                                               oc_tile_size);
        //! size per thread
        size_t all_threads_bytes =
                thread_bundle.total_size_in_bytes() * param.nr_threads;
        //! packa size = GROUP * packa_size_each_group
        size_t packa_bytes_per_oc_tile = matmul_bundle.get_size(0);
        size_t oc_tiles_per_group = div_ceil(OC, oc_tile_size);
        size_t all_packa_bytes =
                packa_bytes_per_oc_tile * oc_tiles_per_group * GROUP;
        if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA)
            return WorkspaceBundle{nullptr,
                                   {all_packa_bytes, 0, all_threads_bytes}};
        //! packb size = N * GROUP * packb_size_per_group
        size_t packb_bytes_per_group = matmul_bundle.get_size(1);
        size_t all_packb_bytes = packb_bytes_per_group * GROUP * BATCH;
        return WorkspaceBundle{
                nullptr, {all_packa_bytes, all_packb_bytes, all_threads_bytes}};
    }
};
template<> | |||||
class Conv1x1Kerns<MatrixMulImpl::AlgoBase::PackMode::NO_PACK> { | |||||
public: | |||||
//! get_bundle | |||||
WorkspaceBundle get_bundle(const ConvBiasImpl::NCBKernSizeParam& param, | |||||
const MatrixMulImpl::KernSizeParam& matmul_param, | |||||
const MatrixMulImpl::AlgoBase* matmul_algo, | |||||
size_t oc_tile_size) { | |||||
size_t matmul_size = matmul_algo->get_workspace(matmul_param); | |||||
auto thread_bundle = get_thread_bundle(param, matmul_size, oc_tile_size); | |||||
//! size per thread | |||||
size_t all_threads_bytes = | |||||
thread_bundle.total_size_in_bytes() * param.nr_threads; | |||||
return WorkspaceBundle{nullptr, {0, 0, all_threads_bytes}}; | |||||
} | |||||
}; | |||||
} // namespace conv1x1 | |||||
} // namespace fallback | |||||
} // namespace megdnn |
@@ -0,0 +1,214 @@ | |||||
/** | |||||
 * \file dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/ | |||||
#include <unordered_map> | |||||
#include "src/fallback/conv_bias/conv1x1/conv1x1_strategy.h" | |||||
#include "midout.h" | |||||
MIDOUT_DECL(megdnn_fallback_conv1x1_factory_strategy) | |||||
namespace megdnn { | |||||
namespace fallback { | |||||
namespace conv1x1 { | |||||
namespace { | |||||
//! Cache key for Conv1x1 strategies. Only the four dtypes inside `param`,
//! the tensor format and the matmul pack mode participate in hashing and
//! equality -- the strategy choice depends on nothing else.
struct StrategyHashParam {
    ConvBiasImpl::NCBKernSizeParam param;
    param::ConvBias::Format format;
    MatrixMulImpl::AlgoBase::PackMode packmode;
};
struct StrategyHashParamHash { | |||||
std::size_t operator()(const StrategyHashParam& sparam) const { | |||||
constexpr size_t base = 1; //! avoid hashkey is zero | |||||
std::size_t result = | |||||
static_cast<std::size_t>(sparam.param.src_type.enumv()) + base; | |||||
result = result ^ | |||||
((static_cast<std::size_t>(sparam.param.dst_type.enumv()) + | |||||
base) | |||||
<< 3); | |||||
result = result ^ | |||||
((static_cast<std::size_t>(sparam.param.filter_type.enumv()) + | |||||
base) | |||||
<< 6); | |||||
result = result ^ | |||||
((static_cast<std::size_t>(sparam.param.bias_type.enumv()) + | |||||
base) | |||||
<< 9); | |||||
result = result ^ | |||||
((static_cast<std::size_t>(sparam.format) + base) << 12); | |||||
result = result ^ | |||||
((static_cast<std::size_t>(sparam.packmode) + base) << 15); | |||||
return result; | |||||
}; | |||||
}; | |||||
struct StrategyHashParamEqual { | |||||
bool operator()(const StrategyHashParam& param1, | |||||
const StrategyHashParam& param2) const { | |||||
bool flags = true; | |||||
flags = param1.param.src_type == param2.param.src_type && flags; | |||||
flags = param1.param.filter_type == param2.param.filter_type && flags; | |||||
flags = param1.param.bias_type == param2.param.bias_type && flags; | |||||
flags = param1.param.dst_type == param2.param.dst_type && flags; | |||||
flags = param1.format == param2.format && flags; | |||||
flags = param1.packmode == param2.packmode && flags; | |||||
return flags; | |||||
}; | |||||
}; | |||||
//! Instantiate the Conv1x1Strategy matching \p pack_mode and the dtypes in
//! \p param. Each candidate instantiation sits inside a MIDOUT block so
//! unused combinations can be stripped out at compile time; throws when no
//! combination matches.
std::unique_ptr<Conv1x1StrategyBase> create_conv1x1_strategy(
        const ConvBiasImpl::NCBKernSizeParam& param,
        MatrixMulImpl::AlgoBase::PackMode pack_mode,
        param::ConvBias::Format format) {
    MEGDNN_MARK_USED_VAR(format);
//! cb1: src/bias/dst share one dtype; matched on filter_type only
#define cb1(_packmode, _dt, _post_ctype, _postprocess_mode, _midout_tag)   \
    MIDOUT_BEGIN(megdnn_fallback_conv1x1_factory_strategy,                 \
                 midout_iv(_midout_tag)) {                                 \
        if (param.filter_type.enumv() == DTypeTrait<_dt>::enumv) {         \
            return std::make_unique<                                       \
                    Conv1x1Strategy<_dt, _dt, _dt, _post_ctype, _post_ctype, \
                                    _postprocess_mode, _packmode>>();      \
        }                                                                  \
    }                                                                      \
    MIDOUT_END()
//! cb2: mixed-dtype case (e.g. int8 src with int32 bias/dst); matched on
//! src == filter type plus the dst type
#define cb2(_packmode, _i_src_type, _i_bias_type, _i_dst_type, _src_ctype, \
            _bias_ctype, _dst_ctype, _postprocess_mode, _midout_tag)       \
    MIDOUT_BEGIN(megdnn_fallback_conv1x1_factory_strategy,                 \
                 midout_iv(_midout_tag)) {                                 \
        if (param.filter_type.enumv() == param.src_type.enumv() &&         \
            param.src_type.enumv() == DTypeTrait<_i_src_type>::enumv &&    \
            param.dst_type.enumv() == DTypeTrait<_i_dst_type>::enumv) {    \
            return std::make_unique<                                       \
                    Conv1x1Strategy<_src_ctype, _bias_ctype, _dst_ctype,   \
                                    DTypeTrait<_i_bias_type>::ctype,       \
                                    DTypeTrait<_i_dst_type>::ctype,        \
                                    _postprocess_mode, _packmode>>();      \
        }                                                                  \
    }                                                                      \
    MIDOUT_END()
    switch (pack_mode) {
        case MatrixMulImpl::AlgoBase::PackMode::DEFAULT:
            cb1(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_float32,
                dt_float32, PostprocessMode::FLOAT, "Default::FLOAT"_hash);
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
            cb1(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_float16, __fp16,
                PostprocessMode::FLOAT, "Default::FLOAT16_FP16"_hash);
#else
#if !MEGDNN_DISABLE_FLOAT16
            cb1(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_float16,
                dt_float16, PostprocessMode::NO_PROCESS,
                "Default::FLOAT16_FLOAT16"_hash);
#endif
#endif
            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_int8, dt_int32,
                dt_int32, dt_int8, dt_int32, dt_int32,
                PostprocessMode::NO_PROCESS, "Default::INT8x8x32_INT32"_hash);
            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dt_int8, dt_int16,
                dt_int16, dt_int8, dt_int16, dt_int16,
                PostprocessMode::NO_PROCESS, "Default::INT8x8x16_INT16"_hash);
#if MEGDNN_AARCH64 || MEGDNN_ARMV7
            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT,
                dtype::Quantized8Asymm, dtype::QuantizedS32,
                dtype::QuantizedS32, dt_uint8, dt_int32, dt_int32,
                PostprocessMode::NO_PROCESS,
                "Default::QUINT8x8x32_QINT32"_hash);
            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT,
                dtype::Quantized8Asymm, dtype::QuantizedS32,
                dtype::Quantized8Asymm, dt_uint8, dt_int32, dt_uint8,
                PostprocessMode::QUANTIZED, "Default::QUINT8x8x32_QUINT8"_hash);
#endif
            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dtype::QuantizedS8,
                dtype::QuantizedS32, dtype::QuantizedS32, dt_int8, dt_int32,
                dt_int32, PostprocessMode::NO_PROCESS,
                "Default::QINT8x8x32_QINT32"_hash);
            cb2(MatrixMulImpl::AlgoBase::PackMode::DEFAULT, dtype::QuantizedS8,
                dtype::QuantizedS32, dtype::QuantizedS8, dt_int8, dt_int32,
                dt_int8, PostprocessMode::QUANTIZED,
                "Default::QINT8x8x32_QINT8"_hash);
            break;
        case MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA:
            cb1(MatrixMulImpl::AlgoBase::PackMode::ONLY_PACKA, dt_float32,
                dt_float32, PostprocessMode::FLOAT, "OnlyPackA::FLOAT"_hash);
            break;
        case MatrixMulImpl::AlgoBase::PackMode::NO_PACK:
            cb1(MatrixMulImpl::AlgoBase::PackMode::NO_PACK, dt_float32,
                dt_float32, PostprocessMode::FLOAT, "NoPack::FLOAT"_hash);
            cb2(MatrixMulImpl::AlgoBase::PackMode::NO_PACK, dt_int8, dt_int16,
                dt_int16, dt_int8, dt_int16, dt_int16,
                PostprocessMode::NO_PROCESS, "NoPack::INT8x8x16_INT16"_hash);
            cb2(MatrixMulImpl::AlgoBase::PackMode::NO_PACK, dt_int8, dt_int32,
                dt_int32, dt_int8, dt_int32, dt_int32,
                PostprocessMode::NO_PROCESS, "NoPack::INT8x8x32_INT32"_hash);
            cb2(MatrixMulImpl::AlgoBase::PackMode::NO_PACK,
                dtype::QuantizedS8, dtype::QuantizedS32,
                dtype::QuantizedS32, dt_int8, dt_int32, dt_int32,
                PostprocessMode::NO_PROCESS,
                "NoPack::QINT8x8x32_QINT32"_hash);
            break;
        default:
            megdnn_throw("Invalid Pack Mode");
            break;
    }
#undef cb1
#undef cb2
    //! reached when the pack mode is known but no dtype combination matched
    megdnn_throw("Invalid Data Type");
    return nullptr;
}
class StrategyDelegationStorage { | |||||
public: | |||||
Conv1x1StrategyBase* get(const ConvBiasImpl::NCBKernSizeParam& param, | |||||
MatrixMulImpl::AlgoBase::PackMode pack_mode, | |||||
param::ConvBias::Format format) { | |||||
MEGDNN_LOCK_GUARD(m_mtx); | |||||
StrategyHashParam sparam; | |||||
sparam.param = param; | |||||
sparam.format = format; | |||||
sparam.packmode = pack_mode; | |||||
if (m_map_strategies.find(sparam) == m_map_strategies.end()) { | |||||
auto strategy = create_conv1x1_strategy(param, pack_mode, format); | |||||
m_map_strategies[sparam] = std::move(strategy); | |||||
} | |||||
return m_map_strategies[sparam].get(); | |||||
} | |||||
private: | |||||
std::mutex m_mtx; | |||||
std::unordered_map<StrategyHashParam, std::unique_ptr<Conv1x1StrategyBase>, | |||||
StrategyHashParamHash, StrategyHashParamEqual> | |||||
m_map_strategies; | |||||
}; | |||||
} // anonymous namespace | |||||
//! Facade over the file-local strategy cache: returns a shared, lazily
//! created Conv1x1Strategy for the given dtypes/pack mode/format. The
//! Meyers singleton makes the cache process-wide; ownership stays inside
//! the storage, callers must not delete the pointer.
Conv1x1StrategyBase* Conv1x1Factory::make_conv1x1_strategy(
        const ConvBiasImpl::NCBKernSizeParam& param,
        MatrixMulImpl::AlgoBase::PackMode pack_mode,
        param::ConvBias::Format format) {
    static StrategyDelegationStorage storage;
    return storage.get(param, pack_mode, format);
}
} // namespace conv1x1 | |||||
} // namespace fallback | |||||
} // namespace megdnn |
@@ -0,0 +1,310 @@ | |||||
/** | |||||
* \file dnn/src/fallback/conv_bias/conv1x1/conv1x1_strategy.h | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. | |||||
*/ | |||||
#pragma once | |||||
#include "megdnn/opr_param_defs.h" | |||||
#include "src/fallback/conv_bias/opr_impl.h" | |||||
#if MEGDNN_X86 | |||||
#include "src/x86/conv_bias/postprocess_helper.h" | |||||
#endif | |||||
namespace megdnn { | |||||
namespace fallback { | |||||
namespace conv1x1 { | |||||
#if MEGDNN_X86 | |||||
using namespace x86; | |||||
#endif | |||||
namespace { | |||||
//! get_matmul_kern_param | |||||
//! Build the matmul size descriptor for a 1x1 convolution:
//! M = \p m (one OC tile), N = \p n (usually OH * OW), K = IC;
//! row-major, neither operand transposed.
MatrixMulImpl::KernSizeParam get_matmul_kern_param(
        const ConvBiasImpl::NCBKernSizeParam& param, size_t n, size_t m) {
    size_t M = m;
    size_t N = n;
    size_t K = param.filter_meta.icpg;  //! K = IC
    size_t LDA = K, LDB = N, LDC = N;
    //! when dst is 8-bit the matmul cannot write it directly: it writes in
    //! the (wider) bias type and postprocess requantizes afterwards
    bool is_dst_8bit = (param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
                        param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
                       (param.src_type.enumv() == DTypeEnum::Quantized8Asymm &&
                        param.dst_type.enumv() == DTypeEnum::Quantized8Asymm);
    return {param.filter_type,   //! A = filter
            param.src_type,      //! B = src
            is_dst_8bit ? param.bias_type : param.dst_type,  //! C dtype
            M,
            N,
            K,
            LDA,
            LDB,
            LDC,
            false,   //! A not transposed
            false,   //! B not transposed
            param::MatrixMul::ComputeMode::DEFAULT,
            param::MatrixMul::Format::DEFAULT};
}
} // namespace | |||||
//! Type-erased interface for the three conv1x1 phases; dispatch_kerns binds
//! each virtual below into one NCB kernel. Implementations are stateless,
//! so one instance can be shared across calls and threads.
class Conv1x1StrategyBase {
public:
    //! Pack one OC tile of the filter into the A panel of the workspace
    //! (DEFAULT / ONLY_PACKA pack modes).
    virtual void packA(WorkspaceBundle& whole_bundle,
                       WorkspaceBundle& matmul_bundle,
                       size_t oc_tile_size,
                       const MatrixMulImpl::AlgoBase* matmul_algo,
                       const ConvBiasImpl::NCBKernSizeParam& param,
                       const ConvBiasImpl::NCBKernParam& ncb_param,
                       const ConvBiasImpl::NCBKernIndex& ncb_index) = 0;
    //! Pack the src tensor into B panels for every (batch, group)
    //! (DEFAULT pack mode only).
    virtual void packB(WorkspaceBundle& whole_bundle,
                       WorkspaceBundle& matmul_bundle,
                       const MatrixMulImpl::AlgoBase* matmul_algo,
                       const ConvBiasImpl::NCBKernSizeParam& param,
                       const ConvBiasImpl::NCBKernParam& ncb_param,
                       const ConvBiasImpl::NCBKernIndex& ncb_index) = 0;
    //! Run the matmul for one (batch, group, oc tile) and apply postprocess.
    virtual void exec(WorkspaceBundle& whole_bundle,
                      WorkspaceBundle& matmul_bundle,
                      WorkspaceBundle& thread_bundle,
                      size_t oc_tile_size,
                      const MatrixMulImpl::AlgoBase* matmul_algo,
                      const ConvBiasImpl::NCBKernSizeParam& param,
                      const ConvBiasImpl::NCBKernParam& ncb_param,
                      const ConvBiasImpl::NCBKernIndex& ncb_index) = 0;
    virtual ~Conv1x1StrategyBase() = default;
};
template <typename src_ctype, typename bias_ctype, typename dst_ctype, | |||||
typename op_ctype, typename op_dtype, | |||||
megdnn::PostprocessMode postprocess_mode, MatrixMulImpl::AlgoBase::PackMode pack_mode> | |||||
class Conv1x1Strategy : public Conv1x1StrategyBase { | |||||
public: | |||||
//! Pack one OC tile of the filter (ndrange = {group, oc_tile}) into its
//! slot of the shared A-panel area (workspace slot 0).
void packA(WorkspaceBundle& whole_bundle,
           WorkspaceBundle& matmul_bundle,
           size_t oc_tile_size,
           const MatrixMulImpl::AlgoBase* matmul_algo,
           const ConvBiasImpl::NCBKernSizeParam& param,
           const ConvBiasImpl::NCBKernParam& ncb_param,
           const ConvBiasImpl::NCBKernIndex& ncb_index) override {
    if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::NO_PACK) {
        megdnn_log_error("NoPack mode has no packA kernel");
        return;
    }
    //! bind the size-only bundle to the runtime workspace pointer
    whole_bundle.set(ncb_param.workspace_ptr);
    //! packa size per group
    size_t OC = param.filter_meta.ocpg;
    size_t oc_tiles_per_group = div_ceil(OC, oc_tile_size);
    size_t packa_bytes_per_oc_tile = matmul_bundle.get_size(0);
    size_t packa_bytes_per_group =
            oc_tiles_per_group * packa_bytes_per_oc_tile;
    size_t group_id = ncb_index.ndrange_id[0];
    size_t oc_tile_id_in_group = ncb_index.ndrange_id[1];
    size_t oc_start = oc_tile_id_in_group * oc_tile_size;
    size_t oc_end = oc_start + oc_tile_size;
    //! the last tile of a group may be short
    oc_end = (oc_end <= OC ? oc_end : OC);
    size_t OH = param.osz[0];
    size_t OW = param.osz[1];
    size_t IC = param.filter_meta.icpg;
    MatrixMulImpl::KernParam matmul_kern_param;
    static_cast<MatrixMulImpl::KernSizeParam&>(matmul_kern_param) =
            get_matmul_kern_param(param, OH * OW, oc_end - oc_start);
    //! destination: this tile's slot inside the group's A-panel region
    size_t bytes_offset_of_a_panel =
            group_id * packa_bytes_per_group +
            oc_tile_id_in_group * packa_bytes_per_oc_tile;
    //! source: skip oc_tile_id_in_group tiles of (oc_tile_size x IC) weights
    size_t numbers_offset_of_filter =
            oc_tile_size * IC * oc_tile_id_in_group;
    src_ctype* a_panel = reinterpret_cast<src_ctype*>(
            reinterpret_cast<int8_t*>(whole_bundle.get(0)) +
            bytes_offset_of_a_panel);
    matmul_kern_param.A_ptr = const_cast<src_ctype*>(
            ncb_param.filter<src_ctype>(group_id) +
            numbers_offset_of_filter);
    matmul_algo->pack_A(matmul_kern_param, a_panel, 0,
                        oc_end - oc_start);
}
//! Pack the src tensor into B panels (workspace slot 1) for every
//! (batch, group). Runs as a single kernel (ndrange = {1}); only the
//! DEFAULT pack mode has a packed B.
void packB(WorkspaceBundle& whole_bundle,
           WorkspaceBundle& matmul_bundle,
           const MatrixMulImpl::AlgoBase* matmul_algo,
           const ConvBiasImpl::NCBKernSizeParam& param,
           const ConvBiasImpl::NCBKernParam& ncb_param,
           const ConvBiasImpl::NCBKernIndex& ncb_index) override {
    if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::DEFAULT) {
        //! bind the size-only bundle to the runtime workspace pointer
        whole_bundle.set(ncb_param.workspace_ptr);
        //! packb size per group
        size_t packb_bytes_per_group = matmul_bundle.get_size(1);
        size_t GROUP = param.filter_meta.group;
        size_t BATCH = param.n;
        size_t SH = param.filter_meta.stride[0];
        size_t SW = param.filter_meta.stride[1];
        size_t OH = param.osz[0];
        size_t OW = param.osz[1];
        size_t OC = param.filter_meta.ocpg;
        MatrixMulImpl::KernParam matmul_kern_param;
        static_cast<MatrixMulImpl::KernSizeParam&>(matmul_kern_param) =
                get_matmul_kern_param(param, OH * OW, OC);
        rep(batch, BATCH) {
            rep(g, GROUP) {
                //! NOTE(review): usable() only accepts stride 1, so this
                //! should be unreachable -- defensive check; it is also
                //! loop-invariant and could be hoisted above the loops
                if (SH == 2 && SW == 2)
                    megdnn_throw("no support for stride = 2");
                size_t bytes_offset_of_b_panel =
                        batch * packb_bytes_per_group * GROUP +
                        g * packb_bytes_per_group;
                src_ctype* b_panel = reinterpret_cast<src_ctype*>(
                        reinterpret_cast<int8_t*>(whole_bundle.get(1)) +
                        bytes_offset_of_b_panel);
                matmul_kern_param.B_ptr = const_cast<src_ctype*>(
                        ncb_param.src<src_ctype>(batch, g));
                matmul_algo->pack_B(matmul_kern_param, b_panel, 0, OH * OW);
            }
        }
    } else {
        megdnn_log_error("OnlyPackA mode and NoPack mode has no packB kernel");
    }
}
//! Execute one (batch, group, oc-tile) work item of the 1x1 convolution:
//! locate the pre-packed A (filter) and B (src) panels in the shared
//! workspace, run the matmul into either the user dst or a per-thread
//! temporary, then apply bias + nonlinearity via PostProcess.
//!
//! Workspace layout (all offsets computed in bytes):
//!   whole_bundle.get(0): packed-A panels, one per (group, oc-tile)
//!   whole_bundle.get(1): packed-B panels, one per (batch, group)
//!   whole_bundle.get(2): per-thread chunk = matmul workspace + temp dst
void exec(WorkspaceBundle& whole_bundle,
          WorkspaceBundle& matmul_bundle,
          WorkspaceBundle& thread_bundle,
          size_t oc_tile_size,
          const MatrixMulImpl::AlgoBase* matmul_algo,
          const ConvBiasImpl::NCBKernSizeParam& param,
          const ConvBiasImpl::NCBKernParam& ncb_param,
          const ConvBiasImpl::NCBKernIndex& ncb_index) override {
    whole_bundle.set(ncb_param.workspace_ptr);
    size_t OC = param.filter_meta.ocpg;
    size_t IC = param.filter_meta.icpg;
    //! packa bytes per group
    size_t oc_tiles_per_group = div_ceil(OC, oc_tile_size);
    size_t packa_bytes_per_oc_tile = matmul_bundle.get_size(0);
    size_t packa_bytes_per_group =
            packa_bytes_per_oc_tile * oc_tiles_per_group;
    //! packb bytes per group
    size_t packb_bytes_per_group = matmul_bundle.get_size(1);
    //! matmul bytes per thread
    size_t matmul_bytes_per_thread = thread_bundle.get_size(0);
    //! this work item's coordinates in the (batch, group, oc-tile) ndrange
    size_t batch_id = ncb_index.ndrange_id[0];
    size_t group_id = ncb_index.ndrange_id[1];
    size_t oc_tile_id_in_group = ncb_index.ndrange_id[2];
    size_t thread_id = ncb_index.thread_id;
    size_t GROUP = param.filter_meta.group;
    size_t OH = param.osz[0];
    size_t OW = param.osz[1];
    //! [oc_start, oc_end) is the output-channel range of this tile;
    //! the last tile of a group is clamped to OC
    size_t oc_start = oc_tile_size * oc_tile_id_in_group;
    size_t oc_end = oc_start + oc_tile_size;
    oc_end = (oc_end <= OC ? oc_end : OC);
    MatrixMulImpl::KernParam matmul_kern_param;
    //! KernParam extends KernSizeParam; fill the size part via slicing
    //! assignment, pointers are set below
    static_cast<MatrixMulImpl::KernSizeParam&>(matmul_kern_param) =
            get_matmul_kern_param(param, OH * OW, oc_end - oc_start);
    //! packed-A panel for (group_id, oc_tile_id_in_group)
    size_t bytes_offset_of_a_panel =
            group_id * packa_bytes_per_group +
            oc_tile_id_in_group * packa_bytes_per_oc_tile;
    int8_t* a_panel = reinterpret_cast<int8_t*>(whole_bundle.get(0)) +
                      bytes_offset_of_a_panel;
    //! packed-B panel for (batch_id, group_id)
    size_t bytes_offset_of_b_panel =
            batch_id * packb_bytes_per_group * GROUP +
            group_id * packb_bytes_per_group;
    int8_t* b_panel = reinterpret_cast<int8_t*>(whole_bundle.get(1)) +
                      bytes_offset_of_b_panel;
    //! per-thread chunk: matmul workspace first, temp dst right after it
    size_t thread_offset = thread_bundle.total_size_in_bytes() * thread_id;
    size_t bytes_offset_of_matmul_dst_this_thread =
            thread_offset + thread_bundle.get_size(0);
    int8_t* matmul_temp_dst =
            reinterpret_cast<int8_t*>(whole_bundle.get(2)) +
            bytes_offset_of_matmul_dst_this_thread;
    //! element (not byte) offset of this tile inside the group's dst
    size_t numbers_of_ncb_dst_offset =
            oc_tile_size * OH * OW * oc_tile_id_in_group;
    void* conv_bias_dst = static_cast<void*>(
            ncb_param.dst<dst_ctype>(batch_id, group_id) +
            numbers_of_ncb_dst_offset);
    //! element offset of this tile inside the group's filter
    size_t numbers_of_ncb_filter_offset =
            oc_tile_size * IC * oc_tile_id_in_group;
    matmul_kern_param.A_ptr = const_cast<src_ctype*>(
            ncb_param.filter<src_ctype>(group_id) +
            numbers_of_ncb_filter_offset);
    matmul_kern_param.B_ptr = const_cast<src_ctype*>(
            ncb_param.src<src_ctype>(batch_id, group_id));
    matmul_kern_param.workspace_ptr =
            reinterpret_cast<int8_t*>(whole_bundle.get(2)) + thread_offset;
    matmul_kern_param.workspace_size = matmul_bytes_per_thread;
    //! quantized-8bit dst cannot hold the 32-bit matmul accumulators, so
    //! the matmul writes to the per-thread temp buffer and PostProcess
    //! requantizes into the real dst
    bool is_dst_8bit =
            (param.src_type.enumv() == DTypeEnum::QuantizedS8 &&
             param.dst_type.enumv() == DTypeEnum::QuantizedS8) ||
            (param.src_type.enumv() == DTypeEnum::Quantized8Asymm &&
             param.dst_type.enumv() == DTypeEnum::Quantized8Asymm);
    void* matmul_dst = is_dst_8bit ? matmul_temp_dst : conv_bias_dst;
    matmul_kern_param.C_ptr = matmul_dst;
    if (pack_mode == MatrixMulImpl::AlgoBase::PackMode::NO_PACK) {
        //! NO_PACK: the full kernel reads A/B straight from KernParam
        auto matmul_kern = matmul_algo->get_kern(matmul_kern_param);
        matmul_kern(matmul_kern_param);
    } else {
        //! DEFAULT / ONLY_PACKA: the naked kernel consumes the
        //! pre-packed panels located above
        auto matmul_kern_naked =
                matmul_algo->get_kern_naked(matmul_kern_param);
        matmul_kern_naked(matmul_kern_param, a_panel, b_panel);
    }
    //! do postprocess
    //! full BIAS is indexed per output element, broadcast bias per channel
    void* bias_ptr = nullptr;
    if (param.bias_mode == megdnn::BiasMode::BIAS)
        bias_ptr = static_cast<void*>(const_cast<bias_ctype*>(
                ncb_param.bias<bias_ctype>(batch_id, group_id) +
                numbers_of_ncb_dst_offset));
    else
        bias_ptr = static_cast<void*>(const_cast<bias_ctype*>(
                ncb_param.bias<bias_ctype>(batch_id, group_id) + oc_start));
    //! batch argument is 1: this call handles a single (batch, tile) slab
    PostProcess<op_ctype, op_dtype, postprocess_mode>::run(
            matmul_dst, bias_ptr, conv_bias_dst, param.bias_mode,
            param.nonlineMode, param.bias_type, param.dst_type, 1_z,
            oc_end - oc_start, OH, OW);
}
}; | |||||
/*!
 * \brief Factory producing the concrete Conv1x1StrategyBase implementation
 *        matching a kernel-size param, matmul pack mode and tensor format.
 */
class Conv1x1Factory {
public:
    //! \brief select/construct the strategy for the given configuration
    //! NOTE(review): ownership of the returned pointer is not visible from
    //! this declaration -- presumably a shared/cached instance; confirm at
    //! the definition before storing or deleting it.
    static Conv1x1StrategyBase* make_conv1x1_strategy(
            const ConvBiasImpl::NCBKernSizeParam& param,
            MatrixMulImpl::AlgoBase::PackMode pack_mode,
            param::ConvBias::Format format);
};
} // namespace conv1x1 | |||||
} // namespace fallback | |||||
} // namespace megdnn |
@@ -15,6 +15,7 @@ | |||||
#include "src/common/opr_delegate.h" | #include "src/common/opr_delegate.h" | ||||
#include "src/common/utils.h" | #include "src/common/utils.h" | ||||
#include "src/fallback/conv_bias/algos.h" | #include "src/fallback/conv_bias/algos.h" | ||||
#include "src/fallback/conv_bias/conv1x1/algos.h" | |||||
#include "src/fallback/conv_bias/im2col/algos.h" | #include "src/fallback/conv_bias/im2col/algos.h" | ||||
#include "src/fallback/conv_bias/opr_impl.h" | #include "src/fallback/conv_bias/opr_impl.h" | ||||
#include "src/naive/convolution/algorithms.h" | #include "src/naive/convolution/algorithms.h" | ||||
@@ -54,7 +55,13 @@ public: | |||||
ohw_tile_size)); | ohw_tile_size)); | ||||
all_algos.emplace_back(refhold.back().get()); | all_algos.emplace_back(refhold.back().get()); | ||||
} | } | ||||
#if 1 | |||||
for (size_t oc_tile_size : {24, 48}) { | |||||
refhold.emplace_back(new AlgoConv1x1( | |||||
static_cast<MatrixMulImpl::AlgoBase*>(algo), | |||||
oc_tile_size)); | |||||
all_algos.emplace_back(refhold.back().get()); | |||||
} | |||||
#if 0 | |||||
//! As these algos maybe very slow, it will make fastrun search slow, so | //! As these algos maybe very slow, it will make fastrun search slow, so | ||||
//! we disable it, but for the test of strategyhelper, we just keep it. | //! we disable it, but for the test of strategyhelper, we just keep it. | ||||
//! FIXME: I do not know a better way to do it. | //! FIXME: I do not know a better way to do it. | ||||
@@ -248,6 +248,7 @@ protected: | |||||
private: | private: | ||||
class AlgoNaive; | class AlgoNaive; | ||||
class AlgoIm2col; | class AlgoIm2col; | ||||
class AlgoConv1x1; | |||||
class AlgoWinogradF32; | class AlgoWinogradF32; | ||||
class AlgoWinogradF32_4x4; | class AlgoWinogradF32_4x4; | ||||
class AlgoWinogradQS8; | class AlgoWinogradQS8; | ||||
@@ -438,7 +438,6 @@ size_t MatrixMulImpl::AlgoInt8x8x32SSEM4N8K2::get_workspace( | |||||
m, n, k, trans_a, trans_b, strategy, cacheline) | m, n, k, trans_a, trans_b, strategy, cacheline) | ||||
.get_workspace_size(); | .get_workspace_size(); | ||||
} | } | ||||
MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL( | MEGDNN_REG_GEMM_FUNC_FOR_IM2COL_IMPL_DETAIL( | ||||
AlgoInt8x8x32SSEM4N8K2, megdnn_x86_matmul_kern, 9, | AlgoInt8x8x32SSEM4N8K2, megdnn_x86_matmul_kern, 9, | ||||
x86::matmul::gemm_sse_s8s8s32_4x8x2, dt_int8, dt_int32, dt_int16); | x86::matmul::gemm_sse_s8s8s32_4x8x2, dt_int8, dt_int32, dt_int16); | ||||
@@ -875,6 +875,82 @@ std::vector<conv_bias::TestArg> get_conv_bias_args( | |||||
return args; | return args; | ||||
} | } | ||||
std::vector<megdnn::test::conv_bias::TestArg> get_conv_bias_1x1_args( | |||||
bool no_bias, bool no_nonlinemode, bool quantized_nlmod, | |||||
bool only_broadcast_bias) { | |||||
using namespace conv_bias; | |||||
using Param = param::ConvBias; | |||||
using NLMode = param::ConvBias::NonlineMode; | |||||
using CONVMode = param::ConvBias::Mode; | |||||
std::vector<TestArg> args; | |||||
auto pack = [&](size_t n, size_t oc, size_t ic, size_t w, size_t h, | |||||
size_t stride, NLMode nlmode, CONVMode convmode) { | |||||
Param param; | |||||
param.stride_h = stride; | |||||
param.stride_w = stride; | |||||
param.pad_h = 0; | |||||
param.pad_w = 0; | |||||
param.mode = convmode; | |||||
param.nonlineMode = nlmode; | |||||
args.emplace_back(param, TensorShape{n, ic, h, w}, | |||||
TensorShape{oc, ic, 1, 1}, TensorShape{}); | |||||
if (!no_bias) { | |||||
args.emplace_back(param, TensorShape{n, ic, h, w}, | |||||
TensorShape{oc, ic, 1, 1}, | |||||
TensorShape{1, oc, 1, 1}); | |||||
if (!only_broadcast_bias) { | |||||
args.emplace_back(param, TensorShape{n, ic, h, w}, | |||||
TensorShape{oc, ic, 1, 1}, | |||||
TensorShape{n, oc, (h - 1) / stride + 1, | |||||
(w - 1) / stride + 1}); | |||||
} | |||||
} | |||||
param.sparse = param::ConvBias::Sparse::GROUP; | |||||
args.emplace_back(param, TensorShape{n, 2 * ic, h, w}, | |||||
TensorShape{2, oc, ic, 1, 1}, TensorShape{}); | |||||
if (!no_bias) { | |||||
args.emplace_back(param, TensorShape{n, 2 * ic, h, w}, | |||||
TensorShape{2, oc, ic, 1, 1}, | |||||
TensorShape{1, 2 * oc, 1, 1}); | |||||
if (!only_broadcast_bias) { | |||||
args.emplace_back(param, TensorShape{n, 2 * ic, h, w}, | |||||
TensorShape{2, oc, ic, 1, 1}, | |||||
TensorShape{n, 2 * oc, (h - 1) / stride + 1, | |||||
(w - 1) / stride + 1}); | |||||
} | |||||
} | |||||
}; | |||||
std::vector<NLMode> nonlinemode = {NLMode::IDENTITY}; | |||||
if (!no_nonlinemode) { | |||||
nonlinemode.emplace_back(NLMode::RELU); | |||||
nonlinemode.emplace_back(NLMode::H_SWISH); | |||||
if (!quantized_nlmod) { | |||||
nonlinemode.emplace_back(NLMode::SIGMOID); | |||||
} | |||||
} | |||||
std::vector<CONVMode> convmodes{param::ConvBias::Mode::CONVOLUTION, | |||||
param::ConvBias::Mode::CROSS_CORRELATION}; | |||||
for (size_t n : {1, 2}) | |||||
for (size_t oc : {1, 9, 33}) | |||||
for (size_t ic : {1, 16, 64}) | |||||
for (size_t size : {7, 14, 28}) | |||||
for (auto nlmode : nonlinemode) | |||||
for (auto convmode : convmodes) { | |||||
pack(n, oc, ic, size, size, 1, nlmode, convmode); | |||||
} | |||||
return args; | |||||
} | |||||
void check_conv_bias(std::vector<conv_bias::TestArg> args, Handle* handle, | void check_conv_bias(std::vector<conv_bias::TestArg> args, Handle* handle, | ||||
const char* algo_name) { | const char* algo_name) { | ||||
using namespace conv_bias; | using namespace conv_bias; | ||||
@@ -76,6 +76,10 @@ std::vector<megdnn::test::conv_bias::TestArg> get_conv_bias_args( | |||||
bool no_nonlinemode, bool quantized_nlmod = false, | bool no_nonlinemode, bool quantized_nlmod = false, | ||||
bool only_broadcast_bias = false); | bool only_broadcast_bias = false); | ||||
std::vector<megdnn::test::conv_bias::TestArg> get_conv_bias_1x1_args( | |||||
bool no_bias, bool no_nonlinemode, bool quantized_nlmod = false, | |||||
bool only_broadcast_bias = false); | |||||
void check_conv_bias(std::vector<megdnn::test::conv_bias::TestArg> args, | void check_conv_bias(std::vector<megdnn::test::conv_bias::TestArg> args, | ||||
megdnn::Handle* handle, const char* algo_name); | megdnn::Handle* handle, const char* algo_name); | ||||
@@ -919,6 +919,79 @@ TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_FP32_PACKA) { | |||||
#undef cb | #undef cb | ||||
} | } | ||||
/**************************** Conv1x1 PackA *************************/ | |||||
namespace { | |||||
void checker_conv_bias(std::vector<conv_bias::TestArg> args, Handle* handle, | |||||
RNG* rng, float epsilon, DType type0, DType type1, | |||||
DType type2, DType type3, const char* algo_name) { | |||||
using namespace conv_bias; | |||||
Checker<ConvBias> checker(handle); | |||||
checker.set_before_exec_callback( | |||||
conv_bias::ConvBiasAlgoChecker<ConvBias>(algo_name)); | |||||
checker.set_dtype(0, type0); | |||||
checker.set_dtype(1, type1); | |||||
checker.set_dtype(2, type2); | |||||
checker.set_dtype(4, type3); | |||||
checker.set_epsilon(epsilon); | |||||
if (NULL != rng) { | |||||
checker.set_rng(0, rng).set_rng(1, rng).set_rng(2, rng).set_rng(3, rng); | |||||
} | |||||
for (auto&& arg : args) { | |||||
checker.set_param(arg.param).execs( | |||||
{arg.src, arg.filter, arg.bias, {}, {}}); | |||||
} | |||||
} | |||||
} // namespace | |||||
#if MEGDNN_X86_WITH_MKL | |||||
//! f32 conv1x1 via the MKL pack-A matmul, oc tile size 24
TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_FP32_PACKA) {
    using namespace conv_bias;
    check_conv_bias(get_conv_bias_1x1_args(false, false), handle(),
                    "CONV1x1:X86_F32_MKL_PACKA:24");
}
//! f32 conv1x1 via the BLAS matmul, oc tile size 48
TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_FP32_BLAS) {
    using namespace conv_bias;
    check_conv_bias(get_conv_bias_1x1_args(false, false), handle(),
                    "CONV1x1:X86_F32_BLAS:48");
}
#endif | |||||
//! int8x8x32 conv1x1 across the available x86 matmul backends; args are
//! generated with no bias and no nonlinearity (int8x8x32 path).
TEST_F(X86_MULTI_THREADS, CONV_BIAS_CONV1X1_S1_INT8X8X32) {
    using namespace conv_bias;
    UniformIntRNG rng{-50, 50};
    float epsilon = 0.001;
    std::vector<conv_bias::TestArg> args = get_conv_bias_1x1_args(true, true);
#if MEGDNN_X86_WITH_MKL_DNN
    //! NOTE(review): the MKL-DNN branch is gated on the same VNNI check as
    //! the VNNI branch below -- looks like a copy-paste; confirm MKL-DNN
    //! int8 here really requires VNNI, otherwise this test is skipped on
    //! non-VNNI machines that could still run it.
    if (x86::is_supported(x86::SIMDType::VNNI)) {
        checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
                          dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
                          "CONV1x1:X86_INT8X8X32_MKLDNN:24");
    }
#endif
#if MEGDNN_X86_WITH_VNNI
    if (x86::is_supported(x86::SIMDType::VNNI)) {
        checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
                          dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
                          "CONV1x1:X86_INT8X8X32_VNNI:24");
    }
#endif
    //! AVX2 backends: both register-blocking variants
    if (x86::is_supported(x86::SIMDType::AVX2)) {
        checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
                          dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
                          "CONV1x1:X86_INT8X8X32_AVX2_4X16X2:24");
        checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
                          dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
                          "CONV1x1:X86_INT8X8X32_AVX2_2X4X16:24");
    }
    //! SSE fallback, always run (baseline for all x86)
    checker_conv_bias(args, handle(), &rng, epsilon, dtype::Int8{},
                      dtype::Int8{}, dtype::Int32{}, dtype::Int32{},
                      "CONV1x1:X86_INT8X8X32_SSE_4X8X2:48");
}
/************************* End Conv1x1 PackA ************************/ | |||||
#endif | #endif | ||||
TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8) { | TEST_F(X86_MULTI_THREADS, CONV_BIAS_IM2COLMATMUL_QINT8) { | ||||