@@ -387,7 +387,9 @@ public: | |||||
//! get algo name, the format is ParamTrait<T>::category:base:p.to_string() | //! get algo name, the format is ParamTrait<T>::category:base:p.to_string() | ||||
//! \warning: base must not contain :. | //! \warning: base must not contain :. | ||||
template <typename T> | template <typename T> | ||||
static std::string algo_name(const std::string& base, const T& p); | |||||
static std::string algo_name( | |||||
const std::string& base, const T& p, | |||||
param::ConvBias::Format format = param::ConvBias::Format::NCHW); | |||||
/*! | /*! | ||||
* \brief parse algo_name and get WinogradParam from algo name. | * \brief parse algo_name and get WinogradParam from algo name. | ||||
* | * | ||||
@@ -388,6 +388,166 @@ ConvBiasImpl::AlgoFP32WinogradF63_4x4::dispatch_kerns( | |||||
return {}; | return {}; | ||||
} | } | ||||
/* =================== AlgoFP32WinogradF23_4x4_NCHW44 =================== */ | |||||
bool ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44::usable( | |||||
fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, | |||||
AlgoSelectionStrategy /*algo_selection_strategy*/) const { | |||||
MEGDNN_MARK_USED_VAR(opr); | |||||
MEGDNN_MARK_USED_VAR(param); | |||||
MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, | |||||
midout_iv("AlgoFP32WinogradF23_4x4_NCHW44"_hash)) { | |||||
if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0) | |||||
return false; | |||||
using Strategy = winograd::winograd_F23_mk4_f_nchw44; | |||||
Strategy strategy(param.src_type, param.filter_type, param.dst_type); | |||||
auto&& matmul_param = | |||||
megdnn::winograd::ConvBias<Strategy, | |||||
param::MatrixMul::Format::MK4>( | |||||
strategy, m_tile_size, param.nr_threads, param.osz[0], | |||||
param.osz[1], param.filter_meta.ocpg) | |||||
.get_matmul_kern_param(param); | |||||
return m_matmul_algo->usable(matmul_param) && | |||||
m_matmul_algo->packmode() == | |||||
fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK && | |||||
(opr->param().format == param::ConvBias::Format::NCHW44 || | |||||
(opr->param().format == | |||||
param::ConvBias::Format::NCHW44_WINOGRAD && | |||||
opr->param().output_block_size == 2 && | |||||
param.winograd_matmul_format == | |||||
param::MatrixMul::Format::MK4)) && | |||||
opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION && | |||||
(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||||
param.filter_meta.spatial[0] == 3) && | |||||
(param.filter_meta.stride[0] == param.filter_meta.stride[1] && | |||||
param.filter_meta.stride[0] == 1) && | |||||
(param.filter_meta.dilation[0] == | |||||
param.filter_meta.dilation[1] && | |||||
param.filter_meta.dilation[0] == 1) && | |||||
param.compute_mode == param::ConvBias::ComputeMode::DEFAULT && | |||||
param.src_type.enumv() == DTypeEnum::Float32; | |||||
} | |||||
MIDOUT_END(); | |||||
return false; | |||||
} | |||||
//! Workspace size is delegated to the fallback Winograd framework.
size_t ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44::get_workspace(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32,
                 midout_iv("AlgoFP32WinogradF23_4x4_NCHW44"_hash)) {
        using Strategy = winograd::winograd_F23_mk4_f_nchw44;
        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
        auto winograd_impl =
                megdnn::winograd::ConvBias<Strategy,
                                           param::MatrixMul::Format::MK4>(
                        strategy, m_tile_size, param.nr_threads, param.osz[0],
                        param.osz[1], param.filter_meta.ocpg);
        return winograd_impl.get_workspace_size(param, m_matmul_algo);
    }
    MIDOUT_END();
    return 0;
}
//! Build the kernel list via the fallback Winograd framework.
SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44::dispatch_kerns(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32,
                 midout_iv("AlgoFP32WinogradF23_4x4_NCHW44"_hash)) {
        using Strategy = winograd::winograd_F23_mk4_f_nchw44;
        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
        return megdnn::winograd::ConvBias<Strategy,
                                          param::MatrixMul::Format::MK4>(
                       strategy, m_tile_size, param.nr_threads, param.osz[0],
                       param.osz[1], param.filter_meta.ocpg)
                .get_kerns(param, m_matmul_algo);
    }
    MIDOUT_END();
    return {};
}
/* =================== AlgoFP32WinogradF63_4x4_NCHW44 ===================== */ | |||||
bool ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44::usable( | |||||
fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param, | |||||
AlgoSelectionStrategy /*algo_selection_strategy*/) const { | |||||
MEGDNN_MARK_USED_VAR(param); | |||||
MEGDNN_MARK_USED_VAR(opr); | |||||
MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32, | |||||
midout_iv("AlgoFP32WinogradF63_4x4_NCHW44"_hash)) { | |||||
if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0) | |||||
return false; | |||||
using Strategy = winograd::winograd_F63_mk4_f_nchw44; | |||||
Strategy strategy(param.src_type, param.filter_type, param.dst_type); | |||||
auto&& matmul_param = | |||||
megdnn::winograd::ConvBias<Strategy, | |||||
param::MatrixMul::Format::MK4>( | |||||
strategy, m_tile_size, param.nr_threads, param.osz[0], | |||||
param.osz[1], param.filter_meta.ocpg) | |||||
.get_matmul_kern_param(param); | |||||
return m_matmul_algo->usable(matmul_param) && | |||||
m_matmul_algo->packmode() == | |||||
fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK && | |||||
(opr->param().format == param::ConvBias::Format::NCHW44 || | |||||
(opr->param().format == | |||||
param::ConvBias::Format::NCHW44_WINOGRAD && | |||||
opr->param().output_block_size == 6 && | |||||
param.winograd_matmul_format == | |||||
param::MatrixMul::Format::MK4)) && | |||||
opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION && | |||||
(param.filter_meta.spatial[0] == param.filter_meta.spatial[1] && | |||||
param.filter_meta.spatial[0] == 3) && | |||||
(param.filter_meta.stride[0] == param.filter_meta.stride[1] && | |||||
param.filter_meta.stride[0] == 1) && | |||||
(param.filter_meta.dilation[0] == | |||||
param.filter_meta.dilation[1] && | |||||
param.filter_meta.dilation[0] == 1) && | |||||
param.compute_mode == param::ConvBias::ComputeMode::DEFAULT && | |||||
param.src_type.enumv() == DTypeEnum::Float32 && | |||||
param.filter_meta.icpg % 4 == 0 && | |||||
param.filter_meta.ocpg % 4 == 0; | |||||
} | |||||
MIDOUT_END(); | |||||
return false; | |||||
} | |||||
//! Workspace size is delegated to the fallback Winograd framework.
size_t ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44::get_workspace(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32,
                 midout_iv("AlgoFP32WinogradF63_4x4_NCHW44"_hash)) {
        using Strategy = winograd::winograd_F63_mk4_f_nchw44;
        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
        auto winograd_impl =
                megdnn::winograd::ConvBias<Strategy,
                                           param::MatrixMul::Format::MK4>(
                        strategy, m_tile_size, param.nr_threads, param.osz[0],
                        param.osz[1], param.filter_meta.ocpg);
        return winograd_impl.get_workspace_size(param, m_matmul_algo);
    }
    MIDOUT_END();
    return 0;
}
//! Build the kernel list via the fallback Winograd framework.
SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44::dispatch_kerns(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32,
                 midout_iv("AlgoFP32WinogradF63_4x4_NCHW44"_hash)) {
        using Strategy = winograd::winograd_F63_mk4_f_nchw44;
        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
        return megdnn::winograd::ConvBias<Strategy,
                                          param::MatrixMul::Format::MK4>(
                       strategy, m_tile_size, param.nr_threads, param.osz[0],
                       param.osz[1], param.filter_meta.ocpg)
                .get_kerns(param, m_matmul_algo);
    }
    MIDOUT_END();
    return {};
}
/* ===================== direct algo ===================== */ | /* ===================== direct algo ===================== */ | ||||
MIDOUT_DECL(megdnn_arm_common_conv_bias_f32_kimpl); | MIDOUT_DECL(megdnn_arm_common_conv_bias_f32_kimpl); | ||||
@@ -157,6 +157,64 @@ private: | |||||
uint32_t m_tile_size; | uint32_t m_tile_size; | ||||
}; | }; | ||||
//===================== NCHW44 Winograd Support =====================//
//! Winograd F(2x2, 3x3) algo for fp32 NCHW44 layout (MK4 matmul format).
//! Wraps a fallback matmul algo; the algo name encodes {pack=4, block=2,
//! tile_size} plus the matmul algo name.
class ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44 final : public AlgoBase {
public:
    AlgoFP32WinogradF23_4x4_NCHW44(
            fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size)
            : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {}
    bool is_reproducible() const override { return true; }
    //! name is built lazily and cached (hence the mutable m_name)
    const char* name() const override {
        if (m_name.empty()) {
            m_name = ConvBiasImpl::algo_name<ConvBias::WinogradParam>(
                    m_matmul_algo->name(), {4, 2, m_tile_size},
                    param::ConvBias::Format::NCHW44);
        }
        return m_name.c_str();
    }
    bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
                AlgoSelectionStrategy algo_selection_strategy) const override;
    size_t get_workspace(fallback::ConvBiasImpl*,
                         const NCBKernSizeParam& param) const override;
    //! NOTE: redundant `virtual` dropped; `override` already implies it
    SmallVector<NCBKern> dispatch_kerns(
            fallback::ConvBiasImpl* opr,
            const NCBKernSizeParam& param) const override;

private:
    fallback::MatrixMulImpl::AlgoBase* m_matmul_algo;  //! not owned
    mutable std::string m_name;                        //! lazy name cache
    uint32_t m_tile_size;
};
//! Winograd F(6x6, 3x3) algo for fp32 NCHW44 layout (MK4 matmul format).
//! Same structure as the F23 variant, but with output block size 6.
class ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44 final : public AlgoBase {
public:
    AlgoFP32WinogradF63_4x4_NCHW44(
            fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size)
            : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {}
    bool is_reproducible() const override { return true; }
    //! name is built lazily and cached (hence the mutable m_name)
    const char* name() const override {
        if (m_name.empty()) {
            m_name = ConvBiasImpl::algo_name<ConvBias::WinogradParam>(
                    m_matmul_algo->name(), {4, 6, m_tile_size},
                    param::ConvBias::Format::NCHW44);
        }
        return m_name.c_str();
    }
    bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
                AlgoSelectionStrategy algo_selection_strategy) const override;
    size_t get_workspace(fallback::ConvBiasImpl*,
                         const NCBKernSizeParam& param) const override;
    //! NOTE: redundant `virtual` dropped; `override` already implies it
    SmallVector<NCBKern> dispatch_kerns(
            fallback::ConvBiasImpl* opr,
            const NCBKernSizeParam& param) const override;

private:
    fallback::MatrixMulImpl::AlgoBase* m_matmul_algo;  //! not owned
    mutable std::string m_name;                        //! lazy name cache
    uint32_t m_tile_size;
};
// ================================================================= // | |||||
class ConvBiasImpl::AlgoF32Direct final : public AlgoBase { | class ConvBiasImpl::AlgoF32Direct final : public AlgoBase { | ||||
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const; | SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const; | ||||
bool m_large_group; | bool m_large_group; | ||||
@@ -32,6 +32,12 @@ MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 5, 4, 1, 1, | |||||
MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 4, 5, 1, 1, | MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 4, 5, 1, 1, | ||||
winograd_4x5_1x1_f) | winograd_4x5_1x1_f) | ||||
//! NCHW44 strategies: F(2x2, 3x3) and F(6x6, 3x3), 4x4 channel packing
MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 2, 3, 4, 4,
                             winograd_F23_mk4_f_nchw44)
MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 6, 3, 4, 4,
                             winograd_F63_mk4_f_nchw44)
} // namespace winograd | } // namespace winograd | ||||
} // namespace arm_common | } // namespace arm_common | ||||
} // namespace megdnn | } // namespace megdnn | ||||
@@ -0,0 +1,349 @@ | |||||
/** | |||||
* \file dnn/src/arm_common/conv_bias/fp32/strategy_f23_mk4_nchw44.cpp | |||||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/ | |||||
#include "src/arm_common/conv_bias/fp32/strategy.h" | |||||
#include "src/arm_common/simd_macro/marm_neon.h" | |||||
#include "src/arm_common/utils.h" | |||||
#include "src/common/unroll_macro.h" | |||||
#include "src/common/utils.h" | |||||
#include "src/fallback/conv_bias/winograd/winograd.h" | |||||
#include "src/naive/matrix_mul/matrix_mul_helper.h" | |||||
#include "src/arm_common/elemwise_helper/op_unary.h" | |||||
#include "src/arm_common/conv_bias/fp32/helper.h" | |||||
#include "midout.h" | |||||
MIDOUT_DECL(megdnn_arm_common_winograd_nchw44_fp32_F23_mk4) | |||||
using namespace megdnn; | |||||
using namespace arm_common; | |||||
namespace { | |||||
constexpr size_t alpha = 2 + 3 - 1; | |||||
constexpr size_t pack_size = 4; | |||||
//! Input transform for F(2x2, 3x3) on NCHW44: stages one alpha x alpha
//! (4x4) patch of 4 packed channels, then applies BT * d * B.
struct InputTransformF23_NCHW44 {
    //! Copy one patch into the patchT staging buffer.
    //! \tparam inner true iff the patch lies fully inside the input image;
    //!         when false, out-of-range pixels are zero-filled.
    template <bool inner>
    static void prepare(const float* input, float* patch, float* patchT,
                        int ih_start, int iw_start, size_t IH, size_t IW,
                        size_t ic, size_t IC) {
        MEGDNN_MARK_USED_VAR(patch);
        size_t IW4 = IW * pack_size;
        size_t iw4_start = iw_start * pack_size;
        size_t icb = ic / pack_size;
        // zero the staging buffer unless the inner fast path below will
        // overwrite every element of it
        if (!(inner && ic + pack_size < IC)) {
            memset(patchT, 0, sizeof(float) * pack_size * alpha * alpha);
        }
        if (inner) {
            // fast path: four 4-float NEON loads/stores per patch row
            const float* input_ptr =
                    input + icb * IH * IW4 + ih_start * IW4 + iw4_start;
            for (size_t ih = 0; ih < alpha; ih++) {
#define cb(i) auto v##i = vld1q_f32(input_ptr + pack_size * i);
                UNROLL_CALL_NOWRAPPER(4, cb);
#undef cb
#define cb(i) vst1q_f32(patchT + ih * alpha * pack_size + i * pack_size, v##i);
                UNROLL_CALL_NOWRAPPER(4, cb);
#undef cb
                input_ptr += IW4;
            }
        } else {
            // clamp the patch window to the valid image region
            int ih0_act = std::max<int>(ih_start, 0),
                ih1_act = std::min<int>(ih_start + alpha, IH),
                iw0_act = std::max<int>(iw_start, 0),
                iw1_act = std::min<int>(iw_start + alpha, IW);
            const float* input_ptr = input + icb * IH * IW4;
            // partial copy
            for (int ih = ih0_act; ih < ih1_act; ++ih) {
                for (int iw = iw0_act; iw < iw1_act; ++iw) {
                    size_t iho = ih - ih_start, iwo = iw - iw_start;
                    auto src = vld1q_f32(input_ptr + ih * IW4 + iw * pack_size);
                    vst1q_f32(
                            patchT + iho * alpha * pack_size + iwo * pack_size,
                            src);
                }
            }
        }
    }

    //! Compute BT * d * B on the staged patch and scatter the 4x4 result
    //! into input_transform_buf, indexed by (alpha element, icb, unit).
    static void transform(const float* patchT, float* input_transform_buf,
                          size_t unit_idx, size_t nr_units_in_tile, size_t ic,
                          size_t IC) {
        // BT * d * B
#define cb(m, n)                                               \
    Vector<float, 4> d##m##n = Vector<float, 4>::load(         \
            patchT + m * alpha * pack_size + n * pack_size);
        UNROLL_CALL_NOWRAPPER_D2(4, 4, cb);
#undef cb
        //! 1   0 -1 0    d00 d01 d02 d03     1 0  0  0
        //! 0   1  1 0    d10 d11 d12 d13     0 1 -1 -1
        //! 0  -1  1 0    d20 d21 d22 d23    -1 1  1  0
        //! 0  -1  0 1    d30 d31 d32 d33     0 0  0  1
        // left-multiply by BT (column pass)
#define cb(m)                       \
    auto t0##m = d0##m - d2##m;     \
    auto t1##m = d1##m + d2##m;     \
    auto t2##m = d2##m - d1##m;     \
    auto t3##m = d3##m - d1##m;
        UNROLL_CALL_NOWRAPPER(4, cb);
#undef cb
        // right-multiply by B (row pass), reusing d as the destination
#define cb(m)                       \
    d##m##0 = t##m##0 - t##m##2;    \
    d##m##1 = t##m##1 + t##m##2;    \
    d##m##2 = t##m##2 - t##m##1;    \
    d##m##3 = t##m##3 - t##m##1;
        UNROLL_CALL_NOWRAPPER(4, cb);
#undef cb
        size_t ICB = IC / 4;
        size_t icb = ic / 4;
#define cb(m, n)                                                            \
    d##m##n.save(input_transform_buf +                                      \
                 (m * alpha + n) * ICB * nr_units_in_tile * pack_size +     \
                 icb * nr_units_in_tile * pack_size + unit_idx * pack_size);
        UNROLL_CALL_NOWRAPPER_D2(4, 4, cb)
#undef cb
    }
};
#define CONCAT(a, idx) a##idx
//! Output transform for F(2x2, 3x3): computes AT * m * A on a 4x4 tile of
//! the matmul result, adds bias, applies the nonlinearity Op, and stores
//! the 2x2 output block (4 channels packed) with border clipping.
template <BiasMode bmode, typename Op>
struct OutputTransformF23_NCHW44 {
    static void transform(const float* output_transform_buf, const float* bias,
                          float* output, float* transform_mid_buf,
                          size_t oh_start, size_t ow_start, size_t OH,
                          size_t OW, size_t oc_start, size_t oc_end,
                          size_t oc_index, size_t unit_idx,
                          size_t nr_units_in_tile, const DType& src_dtype,
                          const DType& dst_dtype) {
        MEGDNN_MARK_USED_VAR(transform_mid_buf);
        Op op(src_dtype, dst_dtype);
        //! AT * m * A
        size_t OCB = (oc_end - oc_start) / pack_size;
        size_t oc = oc_start + oc_index;
        size_t ocb = oc_index / pack_size;
#define cb(m, n)                                                     \
    auto v##m##n = Vector<float, 4>::load(                           \
            output_transform_buf +                                   \
            (m * alpha + n) * OCB * nr_units_in_tile * pack_size +   \
            ocb * nr_units_in_tile * pack_size + unit_idx * pack_size);
        UNROLL_CALL_NOWRAPPER_D2(4, 4, cb);
#undef cb
        //! 1  1  1 0    v00 v01 v02 v03    1  0
        //! 0  1 -1 1    v10 v11 v12 v13    1  1
        //!              v20 v21 v22 v23    1 -1
        //!              v30 v31 v32 v33    0  1
        // left-multiply by AT (column pass)
#define cb(m)                                  \
    auto t0##m = v0##m + v1##m + v2##m;        \
    auto t1##m = v1##m - v2##m + v3##m;
        UNROLL_CALL_NOWRAPPER(4, cb);
#undef cb
        // right-multiply by A (row pass), reusing v as the destination
#define cb(m)                                  \
    v##m##0 = t##m##0 + t##m##1 + t##m##2;     \
    v##m##1 = t##m##1 - t##m##2 + t##m##3;
        UNROLL_CALL_NOWRAPPER(2, cb);
#undef cb
        Vector<float, 4> vbias;
        // channel-broadcast bias is added before the activation
        if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) {
            vbias = Vector<float, 4>::load(bias + oc);
#define cb(m, n) v##m##n += vbias;
            UNROLL_CALL_RAW_D2(2, 2, cb);
#undef cb
        }
        // for full BIAS the activation runs inside out_save after the
        // per-pixel bias load; otherwise apply it here
        if (bmode != BiasMode::BIAS) {
#define cb(m, n) v##m##n = op(CONCAT(v##m, n).value);
            UNROLL_CALL_RAW_D2(2, 2, cb);
#undef cb
        }
        // store the 2x2 block, skipping pixels past the output border
#define out_save(oho, owo)                                                  \
    do {                                                                    \
        size_t oh = oh_start + oho;                                         \
        size_t ow = ow_start + owo;                                         \
        if (oh < OH && ow < OW) {                                           \
            if (bmode == BiasMode::BIAS) {                                  \
                v##oho##owo += Vector<float, 4>::load(bias + oc * OH * OW + \
                                                      oh * OW * pack_size + \
                                                      ow * pack_size);      \
                v##oho##owo = op(v##oho##owo.value);                        \
            }                                                               \
            v##oho##owo.save(output + oc * OH * OW + oh * OW * pack_size +  \
                             ow * pack_size);                               \
        }                                                                   \
    } while (0);
        UNROLL_CALL_RAW_D2(2, 2, out_save);
#undef out_save
    }
};
#undef CONCAT
} // namespace | |||||
namespace megdnn { | |||||
namespace arm_common { | |||||
namespace winograd { | |||||
MEGDNN_REG_WINOGRAD_STRATEGY_IMPL(winograd_F23_mk4_f_nchw44)

//! Filter transform: compute G * g * GT for every 4x4 (oc, ic) sub-block of
//! the NCHW44-packed 3x3 kernel, writing results grouped per alpha element.
void winograd_F23_mk4_f_nchw44::filter(const float* filter,
                                       float* filter_transform_buf,
                                       float* transform_mid_buf, size_t OC,
                                       size_t IC, size_t oc_start,
                                       size_t oc_end) {
    //! 1    0    0     v00 v01 v02    1 0.5  0.5 0
    //! 0.5  0.5  0.5   v10 v11 v12    0 0.5 -0.5 0
    //! 0.5 -0.5  0.5   v20 v21 v22    0 0.5  0.5 1
    //! 0    0    1
    constexpr size_t pack_size = 4;
    MEGDNN_MARK_USED_VAR(transform_mid_buf);
    megdnn_assert((oc_end - oc_start) % pack_size == 0 &&
                  oc_start % pack_size == 0 &&
                  oc_end % pack_size == 0 && IC % pack_size == 0 &&
                  OC % pack_size == 0,
                  "NCHW44 Winograd filter transform requires both OC and IC "
                  "are times of 4");
    size_t OCB = OC / pack_size;
    size_t ICB = IC / pack_size;
    for (size_t ocb = oc_start / pack_size; ocb < oc_end / pack_size; ocb++) {
        for (size_t icb = 0; icb < ICB; icb++) {
            for (size_t ic_inner = 0; ic_inner < pack_size; ic_inner++) {
                // one 4-channel lane of the packed 3x3 filter block
                const float* fptr = filter +
                                    (ocb * ICB + icb) * KERNEL_SIZE *
                                            KERNEL_SIZE * pack_size *
                                            pack_size +
                                    ic_inner * pack_size;
#define cb(m, n)                                                   \
    Vector<float, 4> g##m##n = Vector<float, 4>::load(             \
            fptr + (m * KERNEL_SIZE + n) * pack_size * pack_size);
                UNROLL_CALL_NOWRAPPER_D2(3, 3, cb)
#undef cb
#define FILTER_TRANSFORM(n, wd, g)      \
    auto wd##n##0 = g##0##n;            \
    tmp0 = (g##0##n + g##2##n) * 0.5;   \
    tmp1 = g##1##n * 0.5;               \
    auto wd##n##1 = tmp0 + tmp1;        \
    auto wd##n##2 = tmp0 - tmp1;        \
    auto wd##n##3 = g##2##n;
                Vector<float, 4> tmp0, tmp1;
                // first pass builds wd = G * g; second pass ret = wd * GT
                UNROLL_CALL_RAW(3, FILTER_TRANSFORM, wd, g);
                UNROLL_CALL_RAW(4, FILTER_TRANSFORM, ret, wd);
#undef FILTER_TRANSFORM
#define cb_save(m, n)                                                     \
    ret##m##n.save(filter_transform_buf +                                 \
                   (m * ALPHA + n) * OCB * ICB * pack_size * pack_size +  \
                   ocb * ICB * pack_size * pack_size +                    \
                   icb * pack_size * pack_size + ic_inner * pack_size);
                UNROLL_CALL_NOWRAPPER_D2(4, 4, cb_save)
#undef cb_save
            }
        }
    }
}
void winograd_F23_mk4_f_nchw44::input(const float* input, | |||||
float* input_transform_buf, | |||||
float* transform_mid_buf, size_t IH, | |||||
size_t IW, size_t IC, size_t PH, | |||||
size_t PW, size_t unit_start_idx, | |||||
size_t nr_units_in_tile) { | |||||
megdnn_assert(IC % 4 == 0); | |||||
constexpr int alpha = 3 + 2 - 1; | |||||
// OW = IW + 2 * PW - KERNEL_SIZE + 1 | |||||
auto units_w = | |||||
div_ceil<size_t>(IW + 2 * PW - KERNEL_SIZE + 1, OUTPUT_BLOCK_SIZE); | |||||
float* patch = transform_mid_buf; | |||||
float* patchT = transform_mid_buf + 4 * alpha * alpha; | |||||
for (size_t ic = 0; ic < IC; ic += 4) { | |||||
rep(unit_idx, nr_units_in_tile) { | |||||
size_t index = unit_start_idx + unit_idx; | |||||
size_t nh = index / units_w; | |||||
size_t nw = index % units_w; | |||||
int ih_start = nh * OUTPUT_BLOCK_SIZE - PH; | |||||
int iw_start = nw * OUTPUT_BLOCK_SIZE - PW; | |||||
if (ih_start >= 0 && ih_start + alpha <= static_cast<int>(IH) && | |||||
iw_start >= 0 && iw_start + alpha <= static_cast<int>(IW)) { | |||||
InputTransformF23_NCHW44::prepare<true>(input, patch, patchT, | |||||
ih_start, iw_start, IH, | |||||
IW, ic, IC); | |||||
InputTransformF23_NCHW44::transform(patchT, input_transform_buf, | |||||
unit_idx, nr_units_in_tile, | |||||
ic, IC); | |||||
} else { | |||||
InputTransformF23_NCHW44::prepare<false>(input, patch, patchT, | |||||
ih_start, iw_start, IH, | |||||
IW, ic, IC); | |||||
InputTransformF23_NCHW44::transform(patchT, input_transform_buf, | |||||
unit_idx, nr_units_in_tile, | |||||
ic, IC); | |||||
} | |||||
} | |||||
} | |||||
} | |||||
//! Output transform driver: iterate over 4-channel output groups and tile
//! units, dispatching to the OutputTransformF23_NCHW44 specialization that
//! matches (bias mode, nonlinearity).
void winograd_F23_mk4_f_nchw44::output(const float* output_transform_buf,
                                       const float* bias, float* output,
                                       float* transform_mid_buf, BiasMode bmode,
                                       NonlineMode nonline_mode, size_t OH,
                                       size_t OW, size_t oc_start,
                                       size_t oc_end, size_t unit_start_idx,
                                       size_t nr_units_in_tile) {
#define cb(_bmode, _nonline_op, ...)                                       \
    OutputTransformF23_NCHW44<_bmode MEGDNN_COMMA _nonline_op>::transform(  \
            __VA_ARGS__);
    auto units_w = div_ceil<size_t>(OW, OUTPUT_BLOCK_SIZE);
    constexpr size_t pack_size = 4;
    size_t OC = oc_end - oc_start;
    megdnn_assert(OC % pack_size == 0 && oc_start % pack_size == 0 &&
                  oc_end % pack_size == 0,
                  "NCHW44 Winograd filter transform requires OC is times of 4");
    for (size_t oc = oc_start; oc < oc_end; oc += 4) {
        size_t oc_index = oc - oc_start;
        rep(unit_idx, nr_units_in_tile) {
            // map the linear unit index back to a 2D output tile position
            size_t index = unit_start_idx + unit_idx;
            auto nh = index / units_w;
            auto nw = index % units_w;
            size_t oh_start = nh * OUTPUT_BLOCK_SIZE;
            size_t ow_start = nw * OUTPUT_BLOCK_SIZE;
            DISPATCH_CONV_WINOGRAD_BIAS(
                    megdnn_arm_common_winograd_nchw44_fp32_F23_mk4, cb, float,
                    float, bmode, nonline_mode, output_transform_buf, bias,
                    output, transform_mid_buf, oh_start, ow_start, OH, OW,
                    oc_start, oc_end, oc_index, unit_idx, nr_units_in_tile,
                    src_dtype, dst_dtype);
        }
    }
#undef cb
}
} // namespace winograd | |||||
} // namespace arm_common | |||||
} // namespace megdnn | |||||
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,410 @@ | |||||
/** | |||||
 * \file dnn/src/arm_common/conv_bias/fp32/strategy_f63_mk4_nchw44.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||||
* | |||||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||||
* | |||||
* Unless required by applicable law or agreed to in writing, | |||||
* software distributed under the License is distributed on an | |||||
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/ | |||||
#include "src/arm_common/conv_bias/fp32/filter_transform.h" | |||||
#include "src/arm_common/conv_bias/fp32/helper.h" | |||||
#include "src/arm_common/conv_bias/fp32/strategy.h" | |||||
#include "src/arm_common/elemwise_helper/op_unary.h" | |||||
#include "src/arm_common/simd_macro/marm_neon.h" | |||||
#include "src/arm_common/utils.h" | |||||
#include "src/common/unroll_macro.h" | |||||
#include "src/common/utils.h" | |||||
#include "src/common/winograd/winograd_helper.h" | |||||
#include "src/fallback/conv_bias/winograd/winograd.h" | |||||
#include "midout.h" | |||||
MIDOUT_DECL(megdnn_arm_common_winograd_fp32_F63_mk4) | |||||
using namespace megdnn; | |||||
using namespace arm_common; | |||||
namespace { | |||||
constexpr size_t alpha = 6 + 3 - 1; | |||||
constexpr size_t pack_size = 4; | |||||
//! Input transform for F(6x6, 3x3) on NCHW44: stages one alpha x alpha
//! (8x8) patch of 4 packed channels, then applies BT * d * B.
struct InputTransformF63_NCHW44 {
    //! Copy one patch into the patchT staging buffer.
    //! \tparam inner true iff the patch lies fully inside the input image;
    //!         when false, out-of-range pixels are zero-filled.
    template <bool inner>
    static void prepare(const float* input, float* patch, float* patchT,
                        int ih_start, int iw_start, size_t IH, size_t IW,
                        size_t ic, size_t IC) {
        MEGDNN_MARK_USED_VAR(patch);
        size_t IW4 = IW * pack_size;
        size_t iw4_start = iw_start * pack_size;
        size_t icb = ic / pack_size;
        // zero the staging buffer unless the inner fast path below will
        // overwrite every element of it
        if (!(inner && ic + pack_size < IC)) {
            memset(patchT, 0, sizeof(float) * pack_size * alpha * alpha);
        }
        if (inner) {
            // fast path: eight 4-float NEON loads/stores per patch row
            const float* input_ptr =
                    input + icb * IH * IW4 + ih_start * IW4 + iw4_start;
            for (size_t ih = 0; ih < alpha; ih++) {
#define cb(i) auto v##i = vld1q_f32(input_ptr + pack_size * i);
                UNROLL_CALL_NOWRAPPER(8, cb);
#undef cb
#define cb(i) vst1q_f32(patchT + ih * pack_size * alpha + i * pack_size, v##i);
                UNROLL_CALL_NOWRAPPER(8, cb);
#undef cb
                input_ptr += IW4;
            }
        } else {
            // clamp the patch window to the valid image region
            int ih0_act = std::max<int>(ih_start, 0),
                ih1_act = std::min<int>(ih_start + alpha, IH),
                iw0_act = std::max<int>(iw_start, 0),
                iw1_act = std::min<int>(iw_start + alpha, IW);
            const float* input_ptr = input + icb * IH * IW4;
            // partial copy
            for (int ih = ih0_act; ih < ih1_act; ++ih) {
                for (int iw = iw0_act; iw < iw1_act; ++iw) {
                    size_t iho = ih - ih_start, iwo = iw - iw_start;
                    auto src = vld1q_f32(input_ptr + ih * IW4 + iw * pack_size);
                    vst1q_f32(
                            patchT + iho * pack_size * alpha + iwo * pack_size,
                            src);
                }
            }
        }
    }

    //! Compute BT * d * B on the staged 8x8 patch and scatter the result
    //! into input_transform_buf, indexed by (alpha element, icb, unit).
    static void transform(const float* patchT, float* input_transform_buf,
                          size_t unit_idx, size_t nr_units_in_tile, size_t ic,
                          size_t IC) {
        // BT * d * B
#define cb(m, n)                                               \
    Vector<float, 4> d##m##n = Vector<float, 4>::load(         \
            patchT + m * alpha * pack_size + n * pack_size);
        UNROLL_CALL_NOWRAPPER_D2(8, 8, cb);
#undef cb
        //! B
        //! 1     0     0     0     0    0    0     0
        //! 0     1    -1     0.5  -0.5  2   -2    -1
        //! -5.25 1     1     0.25  0.25 4    4     0
        //! 0    -4.25  4.25 -2.5   2.5 -2.5  2.5   5.25
        //! 5.25 -4.25 -4.25 -1.25 -1.25 -5  -5     0
        //! 0     1    -1     2    -2    0.5 -0.5  -5.25
        //! -1    1     1     1     1    1    1     0
        //! 0     0     0     0     0    0    0     1
        // left-multiply by BT (column pass)
#define cb(m)                                                                 \
    auto t0##m = d0##m + (d4##m - d2##m) * 5.25f - d6##m;                     \
    auto t1##m = d1##m + d2##m + d5##m + d6##m - (d3##m + d4##m) * 4.25f;     \
    auto t2##m = d2##m + d6##m - (d1##m + d5##m) + (d3##m - d4##m) * 4.25f;   \
    auto t3##m = d1##m * 0.5f + d2##m * 0.25f - d3##m * 2.5f - d4##m * 1.25f + \
                 d5##m * 2.f + d6##m;                                         \
    auto t4##m = d1##m * (-0.5f) + d2##m * 0.25f + d3##m * 2.5f -             \
                 d4##m * 1.25f - d5##m * 2.f + d6##m;                         \
    auto t5##m = d1##m * 2.f + d2##m * 4.f - d3##m * 2.5f - d4##m * 5.f +     \
                 d5##m * 0.5f + d6##m;                                        \
    auto t6##m = d1##m * (-2.f) + d2##m * 4.f + d3##m * 2.5f - d4##m * 5.f -  \
                 d5##m * 0.5f + d6##m;                                        \
    auto t7##m = (d7##m - d1##m) + (d3##m - d5##m) * 5.25f;
        UNROLL_CALL_NOWRAPPER(8, cb);
#undef cb
        // right-multiply by B (row pass), reusing d as the destination
#define cb(m)                                                                 \
    d##m##0 = t##m##0 + (t##m##4 - t##m##2) * 5.25f - t##m##6;                \
    d##m##1 = t##m##1 + t##m##2 + t##m##5 + t##m##6 -                         \
              (t##m##3 + t##m##4) * 4.25f;                                    \
    d##m##2 = t##m##2 + t##m##6 - (t##m##1 + t##m##5) +                       \
              (t##m##3 - t##m##4) * 4.25f;                                    \
    d##m##3 = t##m##1 * 0.5f + t##m##2 * 0.25f - t##m##3 * 2.5f -             \
              t##m##4 * 1.25f + t##m##5 * 2.f + t##m##6;                      \
    d##m##4 = t##m##1 * (-0.5f) + t##m##2 * 0.25f + t##m##3 * 2.5f -          \
              t##m##4 * 1.25f - t##m##5 * 2.f + t##m##6;                      \
    d##m##5 = t##m##1 * 2.f + t##m##2 * 4.f - t##m##3 * 2.5f - t##m##4 * 5.f + \
              t##m##5 * 0.5f + t##m##6;                                       \
    d##m##6 = t##m##1 * (-2.f) + t##m##2 * 4.f + t##m##3 * 2.5f -             \
              t##m##4 * 5.f - t##m##5 * 0.5f + t##m##6;                       \
    d##m##7 = (t##m##7 - t##m##1) + (t##m##3 - t##m##5) * 5.25f;
        UNROLL_CALL_NOWRAPPER(8, cb);
#undef cb
        size_t ICB = IC / pack_size;
        size_t icb = ic / pack_size;
#define cb(m, n)                                                            \
    d##m##n.save(input_transform_buf +                                      \
                 (m * alpha + n) * ICB * nr_units_in_tile * pack_size +     \
                 icb * nr_units_in_tile * pack_size + unit_idx * pack_size);
        UNROLL_CALL_NOWRAPPER_D2(8, 8, cb)
#undef cb
    }
};
template <BiasMode bmode, typename Op>
struct OutputTransformF63_NCHW44 {
    //! Winograd F(6,3) output transform for the NCHW44 layout: load one
    //! 8x8 tile of the batched-matmul result, compute AT * m * A to obtain
    //! the 6x6 spatial output block, add bias, apply the nonlinearity Op
    //! and store it with bounds checks for edge tiles.  Every "element"
    //! handled below is a Vector<float, 4>, i.e. a pack of 4 output
    //! channels.
    //!
    //! \param output_transform_buf matmul result, laid out as
    //!        {alpha, alpha, OCB, nr_units_in_tile, pack_size}
    //! \param bias bias tensor; interpretation depends on bmode
    //! \param output destination in NCHW44 layout
    //! \param transform_mid_buf unused in this specialization
    //! \param oh_start, ow_start top-left corner of this output block
    //! \param oc_index channel offset (relative to oc_start) of this pack
    static void transform(const float* output_transform_buf, const float* bias,
                          float* output, float* transform_mid_buf,
                          size_t oh_start, size_t ow_start, size_t OH,
                          size_t OW, size_t oc_start, size_t oc_end,
                          size_t oc_index, size_t unit_idx,
                          size_t nr_units_in_tile, const DType& src_dtype,
                          const DType& dst_dtype) {
        MEGDNN_MARK_USED_VAR(transform_mid_buf);
        Op op(src_dtype, dst_dtype);
        //! AT * m * A
        //! oc: absolute index of the first output channel of this pack.
        size_t oc = oc_start + oc_index;
        size_t OCB = (oc_end - oc_start) / pack_size;
        size_t ocb = oc_index / pack_size;
        //! load the 8x8 tile v[m][n] for this (ocb, unit_idx) pair.
#define cb(m, n)                                                          \
    auto v##m##n = Vector<float, 4>::load(                                \
            output_transform_buf +                                        \
            (m * alpha + n) * OCB * nr_units_in_tile * pack_size +        \
            ocb * nr_units_in_tile * pack_size + unit_idx * pack_size);
        UNROLL_CALL_NOWRAPPER_D2(8, 8, cb);
#undef cb
        /**
         * A
         *
         * 1    0      0       0       0        0
         * 1    1      1       1       1        1
         * 1    -1     1       -1      1        -1
         * 1    2      4       8       16       32
         * 1    -2     4       -8      16       -32
         * 1    0.5    0.25    0.125   0.0625   0.03125
         * 1    -0.5   0.25    -0.125  0.0625   -0.03125
         * 0    0.0    0       0       0        1
         */
        Vector<float, 4> v1addv2, v1subv2, v3addv4, v3subv4, v5addv6, v5subv6;
        //! first pass: apply the AT coefficients along the first index,
        //! producing t[0..5][m] from v[0..7][m] for each column m.
#define cb(m)                                                         \
    v1addv2 = v1##m + v2##m;                                          \
    v1subv2 = v1##m - v2##m;                                          \
    v3addv4 = v3##m + v4##m;                                          \
    v3subv4 = v3##m - v4##m;                                          \
    v5addv6 = v5##m + v6##m;                                          \
    v5subv6 = v5##m - v6##m;                                          \
    auto t0##m = v0##m + v1addv2 + v3addv4 + v5addv6;                 \
    auto t1##m = v1subv2 + v3subv4 * 2.f + v5subv6 * 0.5f;            \
    auto t2##m = v1addv2 + v3addv4 * 4.f + v5addv6 * 0.25f;           \
    auto t3##m = v1subv2 + v3subv4 * 8.f + v5subv6 * 0.125f;          \
    auto t4##m = v1addv2 + v3addv4 * 16.f + v5addv6 * 0.0625f;        \
    auto t5##m = v1subv2 + v3subv4 * 32.f + v5subv6 * 0.03125f + v7##m;
        UNROLL_CALL_NOWRAPPER(8, cb);
#undef cb
        //! second pass: apply AT along the second index; the results
        //! overwrite v[0..5][0..5], which then hold the final 6x6 block.
#define cb(m)                                                         \
    v1addv2 = t##m##1 + t##m##2;                                      \
    v1subv2 = t##m##1 - t##m##2;                                      \
    v3addv4 = t##m##3 + t##m##4;                                      \
    v3subv4 = t##m##3 - t##m##4;                                      \
    v5addv6 = t##m##5 + t##m##6;                                      \
    v5subv6 = t##m##5 - t##m##6;                                      \
    v##m##0 = t##m##0 + v1addv2 + v3addv4 + v5addv6;                  \
    v##m##1 = v1subv2 + v3subv4 * 2.f + v5subv6 * 0.5f;               \
    v##m##2 = v1addv2 + v3addv4 * 4.f + v5addv6 * 0.25f;              \
    v##m##3 = v1subv2 + v3subv4 * 8.f + v5subv6 * 0.125f;             \
    v##m##4 = v1addv2 + v3addv4 * 16.f + v5addv6 * 0.0625f;           \
    v##m##5 = v1subv2 + v3subv4 * 32.f + v5subv6 * 0.03125f + t##m##7;
        UNROLL_CALL_NOWRAPPER(6, cb);
#undef cb
        Vector<float, 4> vbias;
        //! BROADCAST_CHANNEL_BIAS: one 4-lane bias vector per channel pack.
        if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) {
            vbias = Vector<float, 4>::load(bias + oc);
#define cb(m, n) v##m##n += vbias;
            UNROLL_CALL_RAW_D2(6, 6, cb);
#undef cb
        }
        //! for per-element BIAS the nonlinearity is applied after the bias
        //! add inside out_save; otherwise apply it here.
        if (bmode != BiasMode::BIAS) {
#define cb(m, n) v##m##n = op(CONCAT(v##m, n).value);
            UNROLL_CALL_RAW_D2(6, 6, cb);
#undef cb
        }
        //! bounds-checked store: blocks on the right/bottom border may
        //! extend past OH/OW and those out-of-range elements are skipped.
#define out_save(oho, owo)                                                 \
    do {                                                                   \
        size_t oh = oh_start + oho;                                        \
        size_t ow = ow_start + owo;                                        \
        if (oh < OH && ow < OW) {                                          \
            if (bmode == BiasMode::BIAS) {                                 \
                v##oho##owo += Vector<float, 4>::load(bias + oc * OH * OW + \
                                                      oh * OW * pack_size + \
                                                      ow * pack_size);     \
                v##oho##owo = op(v##oho##owo.value);                       \
            }                                                              \
            v##oho##owo.save(output + oc * OH * OW + oh * OW * pack_size + \
                             ow * pack_size);                              \
        }                                                                  \
    } while (0);
        UNROLL_CALL_RAW_D2(6, 6, out_save);
    }
#undef out_save
};
} // namespace | |||||
namespace megdnn { | |||||
namespace arm_common { | |||||
namespace winograd { | |||||
MEGDNN_REG_WINOGRAD_STRATEGY_IMPL(winograd_F63_mk4_f_nchw44) | |||||
void winograd_F63_mk4_f_nchw44::filter(const float* filter,
                                       float* filter_transform_buf,
                                       float* transform_mid_buf, size_t OC,
                                       size_t IC, size_t oc_start,
                                       size_t oc_end) {
    //! Winograd F(6,3) filter transform for NCHW44: compute G * g * G^T
    //! for every 3x3 filter with output channel in [oc_start, oc_end) and
    //! store the 8x8 result.  Each loaded element is a Vector<float, 4>
    //! holding a pack of 4 channels.
    //! Input layout (dense case): {OC/4, IC/4, 3, 3, 4, 4};
    //! output layout: one OC*IC plane per (m, n) of the 8x8 tile.
    //! transform_mid_buf is unused in this strategy.
    constexpr size_t pack_size = 4;
    // Gg * GT
    // G
    // 1.0000000 0.0000000 0.0000000
    // -0.2222222 -0.2222222 -0.2222222
    // -0.2222222 0.2222222 -0.2222222
    // 0.0111111 0.0222222 0.0444444
    // 0.0111111 -0.0222222 0.0444444
    // 0.7111111 0.3555556 0.1777778
    // 0.7111111 -0.3555556 0.1777778
    // 0.0000000 0.0000000 1.0000000
    MEGDNN_MARK_USED_VAR(transform_mid_buf);
    megdnn_assert((oc_end - oc_start) % pack_size == 0 &&
                          oc_start % pack_size == 0 &&
                          oc_end % pack_size == 0 && IC % pack_size == 0 &&
                          OC % pack_size == 0,
                  "NCHW44 Winograd filter transform requires both OC and IC "
                  "are times of 4");
    size_t ICB = IC / pack_size;
    for (size_t ocb = oc_start / pack_size; ocb < oc_end / pack_size; ocb++) {
        for (size_t icb = 0; icb < ICB; icb++) {
            for (size_t ic_inner = 0; ic_inner < pack_size; ic_inner++) {
                //! fptr: start of the 3x3 filter block of this (ocb, icb)
                //! pair, offset to the ic_inner-th input-channel lane.
                const float* fptr = filter +
                                    (ocb * ICB + icb) * KERNEL_SIZE *
                                            KERNEL_SIZE * pack_size *
                                            pack_size +
                                    ic_inner * pack_size;
                //! load g[m][n]: the 3x3 filter, 4 output channels per load.
#define cb(m, n)                                                      \
    Vector<float, 4> g##m##n = Vector<float, 4>::load(                \
            fptr + (m * KERNEL_SIZE + n) * pack_size * pack_size);
                UNROLL_CALL_NOWRAPPER_D2(3, 3, cb)
#undef cb
#define FILTER_TRANSFORM(n, wd, g)                                \
    auto wd##n##0 = g##0##n;                                      \
    tmp0 = (g##0##n + g##2##n) * -0.2222222f;                     \
    tmp1 = g##1##n * -0.2222222f;                                 \
    auto wd##n##1 = tmp0 + tmp1;                                  \
    auto wd##n##2 = tmp0 - tmp1;                                  \
    tmp0 = g##0##n * 0.0111111f + g##2##n * 0.0444444f;           \
    tmp1 = g##1##n * 0.0222222f;                                  \
    auto wd##n##3 = tmp0 + tmp1;                                  \
    auto wd##n##4 = tmp0 - tmp1;                                  \
    tmp0 = g##0##n * 0.7111111f + g##2##n * 0.1777778f;           \
    tmp1 = g##1##n * 0.3555556f;                                  \
    auto wd##n##5 = tmp0 + tmp1;                                  \
    auto wd##n##6 = tmp0 - tmp1;                                  \
    auto wd##n##7 = g##2##n;
                Vector<float, 4> tmp0, tmp1;
                //! apply the 1-D G transform twice, once per filter axis:
                //! wd = transform of g, ret = transform of wd.
                UNROLL_CALL_RAW(3, FILTER_TRANSFORM, wd, g);
                UNROLL_CALL_RAW(8, FILTER_TRANSFORM, ret, wd);
#undef FILTER_TRANSFORM
                //! scatter the 8x8 tile: one OC*IC plane per (m, n), with
                //! {ocb, icb, ic_inner} addressing inside each plane.
#define cb_save(m, n)                                                     \
    ret##m##n.save(filter_transform_buf + (m * alpha + n) * OC * IC +     \
                   ocb * IC * pack_size + icb * pack_size * pack_size +   \
                   ic_inner * pack_size);
                UNROLL_CALL_NOWRAPPER_D2(8, 8, cb_save)
#undef cb_save
            }
        }
    }
}
void winograd_F63_mk4_f_nchw44::input(const float* input, | |||||
float* input_transform_buf, | |||||
float* transform_mid_buf, size_t IH, | |||||
size_t IW, size_t IC, size_t PH, | |||||
size_t PW, size_t unit_start_idx, | |||||
size_t nr_units_in_tile) { | |||||
constexpr size_t pack_size = 4; | |||||
megdnn_assert(IC % pack_size == 0); | |||||
constexpr int alpha = 3 + 6 - 1; | |||||
// OW = IW + 2 * PW - KERNEL_SIZE + 1 | |||||
auto units_w = | |||||
div_ceil<size_t>(IW + 2 * PW - KERNEL_SIZE + 1, OUTPUT_BLOCK_SIZE); | |||||
float* patch = transform_mid_buf; | |||||
float* patchT = transform_mid_buf + pack_size * alpha * alpha; | |||||
for (size_t ic = 0; ic < IC; ic += pack_size) { | |||||
rep(unit_idx, nr_units_in_tile) { | |||||
size_t index = unit_start_idx + unit_idx; | |||||
size_t nh = index / units_w; | |||||
size_t nw = index % units_w; | |||||
int ih_start = nh * OUTPUT_BLOCK_SIZE - PH; | |||||
int iw_start = nw * OUTPUT_BLOCK_SIZE - PW; | |||||
if (ih_start >= 0 && ih_start + alpha <= static_cast<int>(IH) && | |||||
iw_start >= 0 && iw_start + alpha <= static_cast<int>(IW)) { | |||||
InputTransformF63_NCHW44::prepare<true>(input, patch, patchT, | |||||
ih_start, iw_start, IH, | |||||
IW, ic, IC); | |||||
InputTransformF63_NCHW44::transform(patchT, input_transform_buf, | |||||
unit_idx, nr_units_in_tile, | |||||
ic, IC); | |||||
} else { | |||||
InputTransformF63_NCHW44::prepare<false>(input, patch, patchT, | |||||
ih_start, iw_start, IH, | |||||
IW, ic, IC); | |||||
InputTransformF63_NCHW44::transform(patchT, input_transform_buf, | |||||
unit_idx, nr_units_in_tile, | |||||
ic, IC); | |||||
} | |||||
} | |||||
} | |||||
} | |||||
void winograd_F63_mk4_f_nchw44::output(const float* output_transform_buf,
                                       const float* bias, float* output,
                                       float* transform_mid_buf, BiasMode bmode,
                                       NonlineMode nonline_mode, size_t OH,
                                       size_t OW, size_t oc_start,
                                       size_t oc_end, size_t unit_start_idx,
                                       size_t nr_units_in_tile) {
    //! Winograd F(6,3) output stage for NCHW44: for every pack of 4 output
    //! channels and every unit in the tile, dispatch to the
    //! OutputTransformF63_NCHW44 specialization selected by the runtime
    //! (bias mode, nonlinearity) pair via DISPATCH_CONV_WINOGRAD_BIAS.
    constexpr size_t pack_size = 4;
#define cb(_bmode, _nonline_op, ...)                                       \
    OutputTransformF63_NCHW44<_bmode MEGDNN_COMMA _nonline_op>::transform( \
            __VA_ARGS__);
    auto units_w = div_ceil<size_t>(OW, OUTPUT_BLOCK_SIZE);
    for (size_t oc = oc_start; oc < oc_end; oc += pack_size) {
        size_t oc_index = oc - oc_start;
        rep(unit_idx, nr_units_in_tile) {
            size_t index = unit_start_idx + unit_idx;
            auto nh = index / units_w;
            auto nw = index % units_w;
            //! top-left corner of the 6x6 output block of this unit.
            size_t oh_start = nh * OUTPUT_BLOCK_SIZE;
            size_t ow_start = nw * OUTPUT_BLOCK_SIZE;
            DISPATCH_CONV_WINOGRAD_BIAS(
                    megdnn_arm_common_winograd_fp32_F63_mk4, cb, float, float,
                    bmode, nonline_mode, output_transform_buf, bias, output,
                    transform_mid_buf, oh_start, ow_start, OH, OW, oc_start,
                    oc_end, oc_index, unit_idx, nr_units_in_tile, src_dtype,
                    dst_dtype);
        }
    }
#undef cb
}
} // namespace winograd | |||||
} // namespace arm_common | |||||
} // namespace megdnn | |||||
// vim: syntax=cpp.doxygen |
@@ -169,6 +169,14 @@ public: | |||||
static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo), | static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo), | ||||
tile_size)); | tile_size)); | ||||
winograd_algos.emplace_back(refhold.back().get()); | winograd_algos.emplace_back(refhold.back().get()); | ||||
refhold.emplace_back(new AlgoFP32WinogradF23_4x4_NCHW44( | |||||
static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo), | |||||
tile_size)); | |||||
winograd_algos.emplace_back(refhold.back().get()); | |||||
refhold.emplace_back(new AlgoFP32WinogradF63_4x4_NCHW44( | |||||
static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo), | |||||
tile_size)); | |||||
winograd_algos.emplace_back(refhold.back().get()); | |||||
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | ||||
refhold.emplace_back(new AlgoFP16WinogradF23( | refhold.emplace_back(new AlgoFP16WinogradF23( | ||||
static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo), | static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo), | ||||
@@ -49,6 +49,9 @@ private: | |||||
class AlgoFP32WinogradF54; | class AlgoFP32WinogradF54; | ||||
class AlgoFP32WinogradF45; | class AlgoFP32WinogradF45; | ||||
class AlgoFP32WinogradF23_4x4_NCHW44; | |||||
class AlgoFP32WinogradF63_4x4_NCHW44; | |||||
class AlgoS8ChanWiseStride1NCHW44; | class AlgoS8ChanWiseStride1NCHW44; | ||||
class AlgoS8ChanWiseStride2NCHW44; | class AlgoS8ChanWiseStride2NCHW44; | ||||
@@ -28,13 +28,22 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src, | |||||
using namespace winograd; | using namespace winograd; | ||||
check_exec(src.layout, dst.layout, workspace.size); | check_exec(src.layout, dst.layout, workspace.size); | ||||
//! NCHW44 group conv or NCHW group conv or both dense conv | |||||
size_t flt_start = 0; | size_t flt_start = 0; | ||||
size_t pack_c_size = 1; | |||||
size_t group = 1; | size_t group = 1; | ||||
if (src.layout.ndim == 5) { | |||||
if (src.layout.ndim == 5) { //! {g, OC, IC, FH, FW} | |||||
flt_start = 1; | flt_start = 1; | ||||
group = src.layout[0]; | group = src.layout[0]; | ||||
} else if (src.layout.ndim == 6) { //! {OC/4, IC/4, FH, FW, 4, 4} | |||||
pack_c_size = src.layout[5]; | |||||
} else if (src.layout.ndim == 7) { //! {g, OC/4, IC/4, FH, FW, 4, 4} | |||||
flt_start = 1; | |||||
group = src.layout[0]; | |||||
pack_c_size = src.layout[6]; | |||||
} | } | ||||
size_t OC = src.layout[flt_start], IC = src.layout[flt_start + 1], | |||||
size_t OC = src.layout[flt_start] * pack_c_size, | |||||
IC = src.layout[flt_start + 1] * pack_c_size, | |||||
FW = src.layout[flt_start + 3]; | FW = src.layout[flt_start + 3]; | ||||
size_t m = param().output_block_size; | size_t m = param().output_block_size; | ||||
@@ -68,13 +77,23 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src, | |||||
float* workspace_ptr = workspace.ptr<float>(); | float* workspace_ptr = workspace.ptr<float>(); | ||||
if (FW == 3) { | if (FW == 3) { | ||||
if (m == 2) { | if (m == 2) { | ||||
DISPATCH(winograd_2x3_4x4_f, param::Winograd::Format::MK4, 0, | |||||
0); | |||||
if (pack_c_size == 1) { | |||||
DISPATCH(winograd_2x3_4x4_f, param::Winograd::Format::MK4, | |||||
0, 0); | |||||
} else if (pack_c_size == 4) { | |||||
DISPATCH(winograd_F23_mk4_f_nchw44, | |||||
param::Winograd::Format::MK4, 0, 5); | |||||
} | |||||
} else if (m == 6) { | } else if (m == 6) { | ||||
DISPATCH(winograd_6x3_1x1_f, param::Winograd::Format::DEFAULT, | DISPATCH(winograd_6x3_1x1_f, param::Winograd::Format::DEFAULT, | ||||
0, 1); | 0, 1); | ||||
DISPATCH(winograd_6x3_4x4_f, param::Winograd::Format::MK4, 0, | |||||
2); | |||||
if (pack_c_size == 1) { | |||||
DISPATCH(winograd_6x3_4x4_f, param::Winograd::Format::MK4, | |||||
0, 2); | |||||
} else if (pack_c_size == 4) { | |||||
DISPATCH(winograd_F63_mk4_f_nchw44, | |||||
param::Winograd::Format::MK4, 0, 6); | |||||
} | |||||
} | } | ||||
} else if (FW == 4) { | } else if (FW == 4) { | ||||
if (m == 5) { | if (m == 5) { | ||||
@@ -158,7 +158,10 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( | |||||
} | } | ||||
template <typename T> | template <typename T> | ||||
struct ParamTrait; | |||||
struct NCHWParamTrait; | |||||
template <typename T> | |||||
struct NCHW44ParamTrait; | |||||
std::string ConvBias::WinogradParam::to_string() const { | std::string ConvBias::WinogradParam::to_string() const { | ||||
return ssprintf("%u:%u:%u", channel_block_size, output_block_size, | return ssprintf("%u:%u:%u", channel_block_size, output_block_size, | ||||
@@ -166,32 +169,51 @@ std::string ConvBias::WinogradParam::to_string() const { | |||||
} | } | ||||
template <typename T> | template <typename T> | ||||
std::string ConvBias::algo_name(const std::string& base, const T& p) { | |||||
return ssprintf("%s:%s:%s", ParamTrait<T>::category.c_str(), base.c_str(), | |||||
p.to_string().c_str()); | |||||
std::string ConvBias::algo_name(const std::string& base, const T& p, | |||||
param::ConvBias::Format format) { | |||||
if (format == param::ConvBias::Format::NCHW) { | |||||
return ssprintf("%s:%s:%s", NCHWParamTrait<T>::category.c_str(), | |||||
base.c_str(), p.to_string().c_str()); | |||||
} else if (format == param::ConvBias::Format::NCHW44) { | |||||
return ssprintf("%s:%s:%s", NCHW44ParamTrait<T>::category.c_str(), | |||||
base.c_str(), p.to_string().c_str()); | |||||
} | |||||
megdnn_throw("Invalid format"); | |||||
return ""; | |||||
} | } | ||||
#define FOREACH_CONV_BIAS_PARAM(cb) \ | #define FOREACH_CONV_BIAS_PARAM(cb) \ | ||||
cb(WinogradParam) cb(DirectParam) cb(MatmulParam) cb(DefaultParam) | cb(WinogradParam) cb(DirectParam) cb(MatmulParam) cb(DefaultParam) | ||||
#define cb(pt) \ | |||||
template <> \ | |||||
struct ParamTrait<ConvBias::pt> { \ | |||||
static const std::string category; \ | |||||
#define cb(pt) \ | |||||
template <> \ | |||||
struct NCHWParamTrait<ConvBias::pt> { \ | |||||
static const std::string category; \ | |||||
}; \ | |||||
template <> \ | |||||
struct NCHW44ParamTrait<ConvBias::pt> { \ | |||||
static const std::string category; \ | |||||
}; | }; | ||||
FOREACH_CONV_BIAS_PARAM(cb) | FOREACH_CONV_BIAS_PARAM(cb) | ||||
#undef cb | #undef cb | ||||
#define cb(pt, ct) const std::string ParamTrait<ConvBias::pt>::category = ct | |||||
cb(WinogradParam, "WINOGRAD"); | |||||
#define cb(pt, ct) \ | |||||
const std::string NCHWParamTrait<ConvBias::pt>::category = ct; \ | |||||
const std::string NCHW44ParamTrait<ConvBias::pt>::category = ct | |||||
cb(DirectParam, "DIRECT"); | cb(DirectParam, "DIRECT"); | ||||
cb(MatmulParam, "MATMUL"); | cb(MatmulParam, "MATMUL"); | ||||
cb(DefaultParam, "DEFAULT"); | cb(DefaultParam, "DEFAULT"); | ||||
#undef cb | #undef cb | ||||
const std::string NCHWParamTrait<ConvBias::WinogradParam>::category = | |||||
"WINOGRAD"; | |||||
const std::string NCHW44ParamTrait<ConvBias::WinogradParam>::category = | |||||
"WINOGRAD_NCHW44"; | |||||
#define cb(t) \ | #define cb(t) \ | ||||
template std::string ConvBias::algo_name<ConvBias::t>( \ | template std::string ConvBias::algo_name<ConvBias::t>( \ | ||||
const std::string& base, const ConvBias::t& p); | |||||
const std::string& base, const ConvBias::t& p, \ | |||||
param::ConvBias::Format format); | |||||
FOREACH_CONV_BIAS_PARAM(cb) | FOREACH_CONV_BIAS_PARAM(cb) | ||||
#undef cb | #undef cb | ||||
@@ -199,17 +221,37 @@ ConvBias::WinogradParam ConvBias::parse_winograd_name( | |||||
const std::string& algo_name) { | const std::string& algo_name) { | ||||
ConvBias::WinogradParam ret = INVALID_WINOGRAD_PARAM; | ConvBias::WinogradParam ret = INVALID_WINOGRAD_PARAM; | ||||
char base[128]; | char base[128]; | ||||
sscanf(algo_name.c_str(), "WINOGRAD:%[^:]:%u:%u:%u", base, | |||||
&(ret.channel_block_size), &(ret.output_block_size), | |||||
&(ret.tile_size)); | |||||
if (ret.tile_size == 0 || ret.output_block_size == 0 || | |||||
ret.channel_block_size == 0) { | |||||
megdnn_log_warn("the algo name %s is not suitable for winograd", | |||||
algo_name.c_str()); | |||||
return INVALID_WINOGRAD_PARAM; | |||||
char name[128]; | |||||
auto parse = [&](const std::string& algo_name, | |||||
const std::string& pre) -> auto { | |||||
memset(name, 0, 128); | |||||
sscanf(algo_name.c_str(), "%[^:]:%[^:]:%u:%u:%u", name, base, | |||||
&(ret.channel_block_size), &(ret.output_block_size), | |||||
&(ret.tile_size)); | |||||
if (strcmp(name, pre.c_str())) { | |||||
megdnn_log_warn("algo %s is not %s algo", name, pre.c_str()); | |||||
ret = INVALID_WINOGRAD_PARAM; | |||||
return false; | |||||
} | |||||
if (ret.tile_size == 0 || ret.output_block_size == 0 || | |||||
ret.channel_block_size == 0) { | |||||
megdnn_log_warn("the algo name %s is not suitable for %s", | |||||
algo_name.c_str(), pre.c_str()); | |||||
ret = INVALID_WINOGRAD_PARAM; | |||||
return false; | |||||
} | |||||
return true; | |||||
}; | |||||
if (parse(algo_name, "WINOGRAD_NCHW44")) { | |||||
return ret; | |||||
} else { | |||||
parse(algo_name, "WINOGRAD"); | |||||
return ret; | |||||
} | } | ||||
return ret; | |||||
} | } | ||||
constexpr ConvBias::WinogradParam ConvBias::INVALID_WINOGRAD_PARAM; | constexpr ConvBias::WinogradParam ConvBias::INVALID_WINOGRAD_PARAM; | ||||
void handle_bias_and_nonlinear(Handle* handle, param::ConvBias args, | void handle_bias_and_nonlinear(Handle* handle, param::ConvBias args, | ||||
@@ -299,6 +299,7 @@ void make_canonized_filter_meta_nchwxx( | |||||
megdnn_assert(param.format == Param::Format::NCHW88 || | megdnn_assert(param.format == Param::Format::NCHW88 || | ||||
param.format == Param::Format::NCHW44 || | param.format == Param::Format::NCHW44 || | ||||
param.format == Param::Format::NCHW44_WINOGRAD || | |||||
param.format == Param::Format::NCHW88_WINOGRAD); | param.format == Param::Format::NCHW88_WINOGRAD); | ||||
size_t img_ndim = 2; | size_t img_ndim = 2; | ||||
size_t flt_start = 0; | size_t flt_start = 0; | ||||
@@ -663,6 +664,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||||
param().format == Param::Format::NCHW32 || | param().format == Param::Format::NCHW32 || | ||||
param().format == Param::Format::NCHW88 || | param().format == Param::Format::NCHW88 || | ||||
param().format == Param::Format::NCHW88_WINOGRAD || | param().format == Param::Format::NCHW88_WINOGRAD || | ||||
param().format == Param::Format::NCHW44_WINOGRAD || | |||||
param().format == Param::Format::CHWN4); | param().format == Param::Format::CHWN4); | ||||
img_dim = src.ndim - 3; | img_dim = src.ndim - 3; | ||||
if ((param().format == Param::Format::NCHW88 || | if ((param().format == Param::Format::NCHW88 || | ||||
@@ -68,6 +68,7 @@ constexpr size_t layout_pack_size(param::ConvBias::Format layout) { | |||||
case param::ConvBias::Format::NHWCD4: | case param::ConvBias::Format::NHWCD4: | ||||
return 4; | return 4; | ||||
case param::ConvBias::Format::NCHW4: | case param::ConvBias::Format::NCHW4: | ||||
case param::ConvBias::Format::NCHW44: | |||||
return 4; | return 4; | ||||
case param::ConvBias::Format::NCHW32: | case param::ConvBias::Format::NCHW32: | ||||
return 32; | return 32; | ||||
@@ -363,6 +364,7 @@ INST(uint8_t, uint8_t, int16_t, int) | |||||
_ctype, _dst_type, _input_filter_compute_type, \ | _ctype, _dst_type, _input_filter_compute_type, \ | ||||
_output_compute_type, layout, param::MatrixMul::Format::MK4>; | _output_compute_type, layout, param::MatrixMul::Format::MK4>; | ||||
INST(float, float, float, float, param::ConvBias::Format::NCHW) | INST(float, float, float, float, param::ConvBias::Format::NCHW) | ||||
INST(float, float, float, float, param::ConvBias::Format::NCHW44) | |||||
#undef INST | #undef INST | ||||
#define INST(_ctype, _dst_type, _input_filter_compute_type, \ | #define INST(_ctype, _dst_type, _input_filter_compute_type, \ | ||||
@@ -425,6 +425,7 @@ const T* ConvBiasImpl::NCBKernParam::filter(size_t group_pack_id, | |||||
break; | break; | ||||
} | } | ||||
case ConvBiasImpl::Param::Format::NCHW_WINOGRAD: | case ConvBiasImpl::Param::Format::NCHW_WINOGRAD: | ||||
case ConvBiasImpl::Param::Format::NCHW44_WINOGRAD: | |||||
case ConvBiasImpl::Param::Format::NCHW88_WINOGRAD: { | case ConvBiasImpl::Param::Format::NCHW88_WINOGRAD: { | ||||
//! four format of weight layout | //! four format of weight layout | ||||
//! 1. {g, alpha, alpha, ocpg/8, icpg/8, 8, 8} | //! 1. {g, alpha, alpha, ocpg/8, icpg/8, 8, 8} | ||||
@@ -198,7 +198,8 @@ void ConvolutionImpl::exec_with_ncb_kern(const NCBKernParam& param, | |||||
for (auto kernel : kerns) { | for (auto kernel : kerns) { | ||||
megdnn_assert(param.filter_meta.format == Param::Format::NCHW || | megdnn_assert(param.filter_meta.format == Param::Format::NCHW || | ||||
param.filter_meta.format == Param::Format::NHWC || | param.filter_meta.format == Param::Format::NHWC || | ||||
param.filter_meta.format == Param::Format::NCHW88, | |||||
param.filter_meta.format == Param::Format::NCHW88 || | |||||
param.filter_meta.format == Param::Format::NCHW44, | |||||
"invalid conv format"); | "invalid conv format"); | ||||
auto run = [param, kernel](size_t index, size_t thread_id) { | auto run = [param, kernel](size_t index, size_t thread_id) { | ||||
CpuNDRange ndrange_id(kernel.global_size, index); | CpuNDRange ndrange_id(kernel.global_size, index); | ||||
@@ -137,6 +137,7 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src, | |||||
#undef DISPATCH_FORMAT_MK8 | #undef DISPATCH_FORMAT_MK8 | ||||
#undef DISPATCH_DTYPE | #undef DISPATCH_DTYPE | ||||
} else { | } else { | ||||
megdnn_assert(src.layout.ndim == 6 || src.layout.ndim == 7); | |||||
#define cb(_ctype, _dst_type, _input_filter_compute_type, \ | #define cb(_ctype, _dst_type, _input_filter_compute_type, \ | ||||
_output_compute_type, _format, rescale) \ | _output_compute_type, _format, rescale) \ | ||||
if (param().format == _format) { \ | if (param().format == _format) { \ | ||||
@@ -158,20 +159,58 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src, | |||||
DISPATCH_KERNEL(dt_float32, dt_float32, dt_float32, dt_float32, \ | DISPATCH_KERNEL(dt_float32, dt_float32, dt_float32, dt_float32, \ | ||||
DISPATCH_FORMAT_MK8, 1.0f, _midout_tag, 0); \ | DISPATCH_FORMAT_MK8, 1.0f, _midout_tag, 0); \ | ||||
} | } | ||||
megdnn_assert(src.layout.ndim == 6 || src.layout.ndim == 7); | |||||
if (FW == 3) { | |||||
if (m == 2) { | |||||
std::vector<float> interp_points = {0, 1, -1}; | |||||
DISPATCH_DTYPE(4); | |||||
} else if (m == 6) { | |||||
std::vector<float> interp_points = {0, 1, -1, 2, -2, 0.5, -0.5}; | |||||
DISPATCH_DTYPE(5); | |||||
if (pack_c_size == 8) { //! NCHW88 | |||||
if (FW == 3) { | |||||
if (m == 2) { | |||||
std::vector<float> interp_points = {0, 1, -1}; | |||||
DISPATCH_DTYPE(4); | |||||
} else if (m == 6) { | |||||
std::vector<float> interp_points = {0, 1, -1, 2, | |||||
-2, 0.5, -0.5}; | |||||
DISPATCH_DTYPE(5); | |||||
} | |||||
} | } | ||||
#undef cb | |||||
#undef DISPATCH_FORMAT_MK8 | |||||
#undef DISPATCH_DTYPE | |||||
} | } | ||||
else if (pack_c_size == 4) { //! NCHW44 | |||||
#define cb(_ctype, _dst_type, _input_filter_compute_type, \ | |||||
_output_compute_type, _format, rescale) \ | |||||
if (param().format == _format) { \ | |||||
return winograd::StrategyHelper< \ | |||||
_ctype, _dst_type, _input_filter_compute_type, \ | |||||
_output_compute_type, param::ConvBias::Format::NCHW44, \ | |||||
_format>::filter(src_ptr, dst_ptr, workspace_ptr, OC, IC, 0, \ | |||||
OC, m, FW, interp_points, src.layout.dtype, \ | |||||
rescale); \ | |||||
} | |||||
#define DISPATCH_FORMAT_MK4(_ctype, _dst_type, _input_filter_compute_type, \ | |||||
_output_compute_type, _rescale) \ | |||||
cb(_ctype, _dst_type, _input_filter_compute_type, _output_compute_type, \ | |||||
param::Winograd::Format::MK4, _rescale); | |||||
#define DISPATCH_DTYPE(_midout_tag) \ | |||||
if (src.layout.dtype.enumv() == DTypeEnum::Float32) { \ | |||||
DISPATCH_KERNEL(dt_float32, dt_float32, dt_float32, dt_float32, \ | |||||
DISPATCH_FORMAT_MK4, 1.0f, _midout_tag, 0); \ | |||||
} | |||||
if (FW == 3) { | |||||
if (m == 2) { | |||||
std::vector<float> interp_points = {0, 1, -1}; | |||||
DISPATCH_DTYPE(6); | |||||
} else if (m == 6) { | |||||
std::vector<float> interp_points = {0, 1, -1, 2, | |||||
-2, 0.5, -0.5}; | |||||
DISPATCH_DTYPE(7); | |||||
} | |||||
} | |||||
#undef cb | #undef cb | ||||
#undef DISPATCH_FORMAT_MK8 | #undef DISPATCH_FORMAT_MK8 | ||||
#undef DISPATCH_KERNEL | #undef DISPATCH_KERNEL | ||||
#undef DISPATCH_DTYPE | #undef DISPATCH_DTYPE | ||||
} | |||||
} | } | ||||
megdnn_assert(execed, | megdnn_assert(execed, | ||||
"Unsupport winograd filter preprocess. m: %zu src: %s", m, | "Unsupport winograd filter preprocess. m: %zu src: %s", m, | ||||
@@ -699,6 +699,135 @@ TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F16_F23_8x8) { | |||||
} | } | ||||
#endif | #endif | ||||
void benchmark_winograd_nchw_vs_nchw44(const char* algo_name, Handle* handle) { | |||||
using namespace conv_bias; | |||||
using NLMode = param::ConvBias::NonlineMode; | |||||
std::vector<conv_bias::TestArg> args_nchw44; | |||||
std::vector<conv_bias::TestArg> args_nchw; | |||||
auto pack = [&](size_t n, size_t oc, size_t ic, size_t h, size_t w, | |||||
size_t group, NLMode nlmode) { | |||||
param::ConvBias param; | |||||
param.format = param::ConvBias::Format::NCHW44; | |||||
param.stride_h = 1; | |||||
param.stride_w = 1; | |||||
param.pad_h = 1; | |||||
param.pad_w = 1; | |||||
param.nonlineMode = nlmode; | |||||
if (group == 1) { | |||||
param.sparse = param::ConvBias::Sparse::DENSE; | |||||
args_nchw44.emplace_back(param, TensorShape{n, ic / 4, h, w, 4}, | |||||
TensorShape{oc / 4, ic / 4, 3, 3, 4, 4}, | |||||
TensorShape{}); | |||||
param.format = param::ConvBias::Format::NCHW; | |||||
args_nchw.emplace_back(param, TensorShape{n, ic, h, w}, | |||||
TensorShape{oc, ic, 3, 3}, TensorShape{}); | |||||
} else { | |||||
auto oc_per_group = oc / group; | |||||
auto ic_per_group = ic / group; | |||||
param.sparse = param::ConvBias::Sparse::GROUP; | |||||
args_nchw44.emplace_back(param, | |||||
TensorShape{n, ic_per_group / 4, h, w, 4}, | |||||
TensorShape{group, oc_per_group / 4, | |||||
ic_per_group / 4, 3, 3, 4, 4}, | |||||
TensorShape{}); | |||||
param.format = param::ConvBias::Format::NCHW; | |||||
args_nchw.emplace_back( | |||||
param, TensorShape{n, ic, h, w}, | |||||
TensorShape{group, oc_per_group, ic_per_group, 3, 3}, | |||||
TensorShape{}); | |||||
} | |||||
}; | |||||
std::vector<NLMode> nonlinemode = {NLMode::IDENTITY}; | |||||
for (auto nlmode : nonlinemode) | |||||
for (size_t n : {1, 2}) | |||||
for (size_t group = 1; group <= 2; ++group) { | |||||
pack(n, 512, 512, 15, 15, group, nlmode); | |||||
pack(n, 512, 256, 15, 15, group, nlmode); | |||||
pack(n, 256, 256, 29, 29, group, nlmode); | |||||
pack(n, 256, 128, 29, 29, group, nlmode); | |||||
pack(n, 128, 128, 57, 57, group, nlmode); | |||||
pack(n, 128, 64, 57, 57, group, nlmode); | |||||
pack(n, 24, 24, 224, 224, group, nlmode); | |||||
pack(n, 64, 24, 123, 123, group, nlmode); | |||||
pack(n, 64, 64, 56, 56, group, nlmode); | |||||
pack(n, 128, 128, 28, 28, group, nlmode); | |||||
pack(n, 256, 256, 14, 14, group, nlmode); | |||||
pack(n, 512, 512, 7, 7, group, nlmode); | |||||
} | |||||
using namespace conv_bias; | |||||
constexpr size_t RUN = 10; | |||||
Benchmarker<ConvBias> benchmark_winograd_nchw(handle); | |||||
benchmark_winograd_nchw.set_display(false); | |||||
benchmark_winograd_nchw.set_times(RUN); | |||||
Benchmarker<ConvBias> benchmark_winograd_nchw44(handle); | |||||
benchmark_winograd_nchw44.set_display(false); | |||||
benchmark_winograd_nchw44.set_times(RUN); | |||||
std::string winograd_nchw_algo_name = ssprintf("WINOGRAD:%s", algo_name); | |||||
std::string winograd_nchw44_algo_name = | |||||
ssprintf("WINOGRAD_NCHW44:%s", algo_name); | |||||
for (size_t i = 0; i < args_nchw.size(); ++i) { | |||||
auto arg_nchw = args_nchw[i]; | |||||
auto arg_nchw44 = args_nchw44[i]; | |||||
TensorLayout dst_layout; | |||||
auto opr = handle->create_operator<ConvBias>(); | |||||
opr->param() = arg_nchw.param; | |||||
opr->deduce_layout({arg_nchw.src, dtype::Float32()}, | |||||
{arg_nchw.filter, dtype::Float32()}, | |||||
{arg_nchw.bias, dtype::Float32()}, {}, dst_layout); | |||||
//! dst.nr_elems * IC * FH * FW * 2 | |||||
float computations = dst_layout.total_nr_elems() * arg_nchw.filter[1] * | |||||
arg_nchw.filter[2] * arg_nchw.filter[3] * 2.0 / | |||||
(1024 * 1024 * 1024) * 1e3; | |||||
benchmark_winograd_nchw.set_param(arg_nchw.param); | |||||
auto nchw_used = algo_benchmark<ConvBias>( | |||||
benchmark_winograd_nchw, | |||||
{arg_nchw.src, arg_nchw.filter, {}, {}, {}}, | |||||
winograd_nchw_algo_name.c_str()) / | |||||
RUN; | |||||
benchmark_winograd_nchw44.set_param(arg_nchw44.param); | |||||
auto nchw44_used = | |||||
algo_benchmark<ConvBias>( | |||||
benchmark_winograd_nchw44, | |||||
{arg_nchw44.src, arg_nchw44.filter, {}, {}, {}}, | |||||
winograd_nchw44_algo_name.c_str()) / | |||||
RUN; | |||||
printf("%s %s: nchw: %f ms %f Gflops nchw44: %f ms %f GFlops " | |||||
"speedup: " | |||||
"%f\n", | |||||
arg_nchw.src.to_string().c_str(), | |||||
arg_nchw.filter.to_string().c_str(), nchw_used, | |||||
computations / nchw_used, nchw44_used, | |||||
computations / nchw44_used, nchw_used / nchw44_used); | |||||
} | |||||
} | |||||
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23_MK4_NCHW_VS_NCHW44) { | |||||
#if MEGDNN_AARCH64 | |||||
benchmark_winograd_nchw_vs_nchw44("AARCH64_F32_MK4_4x16:4:2", handle()); | |||||
#else | |||||
benchmark_winograd_nchw_vs_nchw44("ARMV7_F32_MK4_4x8:4:2", handle()); | |||||
#endif | |||||
} | |||||
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F63_MK4_NCHW_VS_NCHW44) { | |||||
#if MEGDNN_AARCH64 | |||||
benchmark_winograd_nchw_vs_nchw44("AARCH64_F32_MK4_4x16:4:6", handle()); | |||||
#else | |||||
benchmark_winograd_nchw_vs_nchw44("ARMV7_F32_MK4_4x8:4:6", handle()); | |||||
#endif | |||||
} | |||||
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23_8x8) { | TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23_8x8) { | ||||
auto benchmark_winograd_quantized = [](const char* algo_name_fp32, | auto benchmark_winograd_quantized = [](const char* algo_name_fp32, | ||||
const char* algo_name_quantized, | const char* algo_name_quantized, | ||||
@@ -654,6 +654,15 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F23_4) { | |||||
check_winograd("4:2:32", checker, args, param::MatrixMul::Format::MK4); | check_winograd("4:2:32", checker, args, param::MatrixMul::Format::MK4); | ||||
} | } | ||||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F23_4_NCHW44) { | |||||
using namespace conv_bias; | |||||
std::vector<TestArg> args = get_nchw44_conv_bias_args({3}, 1); | |||||
Checker<ConvBiasForward> checker(handle()); | |||||
check_winograd("4:2:32", checker, args, param::MatrixMul::Format::MK4, | |||||
param::ConvBias::Format::NCHW44); | |||||
} | |||||
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F63) { | TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F63) { | ||||
using namespace conv_bias; | using namespace conv_bias; | ||||
std::vector<TestArg> args = get_winograd_args(3); | std::vector<TestArg> args = get_winograd_args(3); | ||||
@@ -667,7 +676,15 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F63_4) { | |||||
std::vector<TestArg> args = get_winograd_mk_packed_args(); | std::vector<TestArg> args = get_winograd_mk_packed_args(); | ||||
Checker<ConvBiasForward> checker(handle()); | Checker<ConvBiasForward> checker(handle()); | ||||
check_winograd("4:6:32", checker, args, param::MatrixMul::Format::MK4); | |||||
check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4); | |||||
} | |||||
//! Winograd F(6,3) conv-bias on NCHW44-layout tensors, MK4-packed matmul.
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F63_4_NCHW44) {
    using namespace conv_bias;
    // 3x3 filters, stride 1 — the shapes winograd F(6,3) supports.
    std::vector<TestArg> args = get_nchw44_conv_bias_args({3}, 1);
    Checker<ConvBiasForward> checker(handle());
    // Param string encodes the winograd variant; NCHW44 format is passed
    // explicitly so the NCHW44 winograd algos are selected.
    check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4,
                   param::ConvBias::Format::NCHW44);
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F54) { | TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F54) { | ||||
@@ -761,6 +778,75 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD) { | |||||
#endif | #endif | ||||
} | } | ||||
//! Check the two-stage winograd path on NCHW44: run WinogradFilterPreprocess
//! to transform the filter ahead of time, then execute ConvBias in
//! NCHW44_WINOGRAD format on the transformed filter, and compare against the
//! checker's default (fused) implementation.
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_PREPROCESS_NCHW44) {
    using namespace conv_bias;
    std::vector<TestArg> nchw44_args = get_nchw44_conv_bias_args({3}, 1);
    Checker<ConvBiasForward> checker(handle());
    // Reference implementation handed to the checker: tensors are
    // {src, filter, bias, z, dst}; `m` is the winograd output block size.
    auto extra_impl = [](const TensorNDArray& tensors, uint32_t m,
                         param::ConvBias param, Handle* handle) {
        megdnn_assert(param.format == param::ConvBias::Format::NCHW44);
        // Stage 1: transform the filter into winograd domain (MK4 packing).
        auto winograd_preprocess_opr =
                handle->create_operator<WinogradFilterPreprocess>();
        winograd_preprocess_opr->param().output_block_size = m;
        winograd_preprocess_opr->param().format = param::MatrixMul::Format::MK4;
        TensorLayout filter_transform_layout;
        winograd_preprocess_opr->deduce_layout(tensors[1].layout,
                                               filter_transform_layout);
        size_t winograd_preprocess_workspace_in_bytes =
                winograd_preprocess_opr->get_workspace_in_bytes(
                        tensors[1].layout, filter_transform_layout);
        // Stage 2: a ConvBias operator that consumes the pre-transformed
        // filter, hence NCHW44_WINOGRAD format with matching block size.
        auto conv_bias_opr = handle->create_operator<ConvBias>();
        conv_bias_opr->param() = param;
        conv_bias_opr->param().format = param::ConvBias::Format::NCHW44_WINOGRAD;
        conv_bias_opr->param().output_block_size = m;
        size_t conv_bias_workspace_in_bytes =
                conv_bias_opr->get_workspace_in_bytes(
                        tensors[0].layout, filter_transform_layout,
                        tensors[2].layout, tensors[3].layout,
                        tensors[4].layout, nullptr);
        // One allocation for everything: slot 0 holds the transformed
        // filter, slot 1 the conv workspace, slot 2 the preprocess workspace.
        WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(),
                                     conv_bias_workspace_in_bytes,
                                     winograd_preprocess_workspace_in_bytes});
        wb.set(malloc(wb.total_size_in_bytes()));
        TensorND filter_transform_tensor(wb.get(0),
                                         std::move(filter_transform_layout));
        winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor,
                                      wb.get_workspace(2));
        conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2],
                            tensors[3], tensors[4], nullptr,
                            wb.get_workspace(1));
        free(wb.ptr());
    };
    // Drive the checker over every arg and every output block size; dtype
    // slot 3 (z tensor) is deliberately left at its default.
    auto run = [&checker, &extra_impl](
                       Handle* handle, const std::vector<TestArg>& args,
                       const std::vector<size_t>& out_size, DType A_dtype,
                       DType B_dtype, DType C_dtype, DType D_dtype,
                       const float eps) {
        for (auto&& arg : args) {
            for (uint32_t m : out_size) {
                checker.set_extra_opr_impl(std::bind(extra_impl,
                                                     std::placeholders::_1, m,
                                                     arg.param, handle));
                checker.set_dtype(0, A_dtype)
                        .set_dtype(1, B_dtype)
                        .set_dtype(2, C_dtype)
                        .set_dtype(4, D_dtype)
                        .set_epsilon(eps)
                        .set_param(arg.param)
                        .execs({arg.src, arg.filter, arg.bias, {}, {}});
            }
        }
    };
    // m ∈ {2, 6} covers the F(2,3) and F(6,3) variants, all-fp32.
    run(handle(), nchw44_args, {2, 6}, dtype::Float32(), dtype::Float32(),
        dtype::Float32(), dtype::Float32(), 1e-3f);
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_F32_1) { | TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_F32_1) { | ||||
using namespace conv_bias; | using namespace conv_bias; | ||||
@@ -72,7 +72,7 @@ void WinogradTransformReplacePass::apply(OptState& opt) const { | |||||
auto&& inputs = conv_bias_opr.input(); | auto&& inputs = conv_bias_opr.input(); | ||||
VarNodeArray new_inp; | VarNodeArray new_inp; | ||||
new_inp.reserve(inputs.size()); | new_inp.reserve(inputs.size()); | ||||
for (auto i: inputs) { | |||||
for (auto i : inputs) { | |||||
new_inp.push_back(rewriter.get_var(i)); | new_inp.push_back(rewriter.get_var(i)); | ||||
} | } | ||||
@@ -86,11 +86,15 @@ void WinogradTransformReplacePass::apply(OptState& opt) const { | |||||
megdnn::ConvBias::parse_winograd_name(algo_name); | megdnn::ConvBias::parse_winograd_name(algo_name); | ||||
if (winograd_param == megdnn::ConvBias::INVALID_WINOGRAD_PARAM) | if (winograd_param == megdnn::ConvBias::INVALID_WINOGRAD_PARAM) | ||||
break; | break; | ||||
mgb_assert(conv_bias_opr.param().format == | |||||
megdnn::ConvBias::Param::Format::NCHW || | |||||
conv_bias_opr.param().format == | |||||
megdnn::ConvBias::Param::Format::NCHW88, | |||||
"currently winograd only suppport NCHW and nchw88"); | |||||
mgb_assert( | |||||
conv_bias_opr.param().format == | |||||
megdnn::ConvBias::Param::Format::NCHW || | |||||
conv_bias_opr.param().format == | |||||
megdnn::ConvBias::Param::Format::NCHW88 || | |||||
conv_bias_opr.param().format == | |||||
megdnn::ConvBias::Param::Format::NCHW44, | |||||
"currently winograd only suppport NCHW and NCHW44 and " | |||||
"NCHW88"); | |||||
opr::ConvBiasForward::check_winograd_param_valid( | opr::ConvBiasForward::check_winograd_param_valid( | ||||
winograd_param, conv_bias_opr.input(0)->dtype()); | winograd_param, conv_bias_opr.input(0)->dtype()); | ||||
megdnn::param::Winograd winograd_preprocess_param; | megdnn::param::Winograd winograd_preprocess_param; | ||||
@@ -110,8 +114,17 @@ void WinogradTransformReplacePass::apply(OptState& opt) const { | |||||
megdnn::ConvBias::Param::Format::NCHW_WINOGRAD; | megdnn::ConvBias::Param::Format::NCHW_WINOGRAD; | ||||
} else { | } else { | ||||
mgb_assert(new_inp[0]->shape().ndim == 5); | mgb_assert(new_inp[0]->shape().ndim == 5); | ||||
conv_bias_param.format = | |||||
megdnn::ConvBias::Param::Format::NCHW88_WINOGRAD; | |||||
size_t pack_size = new_inp[0]->shape()[4]; | |||||
if (pack_size == 8) { | |||||
conv_bias_param.format = | |||||
megdnn::ConvBias::Param::Format::NCHW88_WINOGRAD; | |||||
} else if (pack_size == 4) { | |||||
conv_bias_param.format = | |||||
megdnn::ConvBias::Param::Format::NCHW44_WINOGRAD; | |||||
} else { | |||||
mgb_assert(0, "Invalid pack size %zu in algo %s", pack_size, | |||||
algo_name.c_str()); | |||||
} | |||||
} | } | ||||
conv_bias_param.output_block_size = | conv_bias_param.output_block_size = | ||||
winograd_param.output_block_size; | winograd_param.output_block_size; | ||||