@@ -387,7 +387,9 @@ public: | |||
//! get algo name, the format is ParamTrait<T>::category:base:p.to_string()
//! \warning base must not contain ':', since ':' is the field separator
//! used when the produced name is parsed back.
//! \param base the algorithm's base name
//! \param p the parameter pack serialized into the name
//! \param format tensor layout the algorithm works on; defaults to NCHW so
//! pre-existing callers that pass two arguments keep their old behavior.
template <typename T>
static std::string algo_name(
        const std::string& base, const T& p,
        param::ConvBias::Format format = param::ConvBias::Format::NCHW);
/*! | |||
* \brief parse algo_name and get WinogradParam from algo name. | |||
* | |||
@@ -388,6 +388,166 @@ ConvBiasImpl::AlgoFP32WinogradF63_4x4::dispatch_kerns( | |||
return {}; | |||
} | |||
/* =================== AlgoFP32WinogradF23_4x4_NCHW44 =================== */ | |||
bool ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44::usable(
        fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    MEGDNN_MARK_USED_VAR(opr);
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32,
                 midout_iv("AlgoFP32WinogradF23_4x4_NCHW44"_hash)) {
        //! NCHW44 packs channels in groups of 4, so both input and output
        //! channels per group must be multiples of 4
        if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0)
            return false;
        using Strategy = winograd::winograd_F23_mk4_f_nchw44;
        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
        auto&& matmul_param =
                megdnn::winograd::ConvBias<Strategy,
                                           param::MatrixMul::Format::MK4>(
                        strategy, m_tile_size, param.nr_threads, param.osz[0],
                        param.osz[1], param.filter_meta.ocpg)
                        .get_matmul_kern_param(param);
        //! the backing matmul must accept the derived shapes and must not
        //! need packing, since winograd pre-transforms both operands
        const bool matmul_ok =
                m_matmul_algo->usable(matmul_param) &&
                m_matmul_algo->packmode() ==
                        fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK;
        //! accept plain NCHW44, or pre-transformed NCHW44_WINOGRAD input
        //! with matching output block size (2) and MK4 matmul format
        const bool format_ok =
                opr->param().format == param::ConvBias::Format::NCHW44 ||
                (opr->param().format ==
                         param::ConvBias::Format::NCHW44_WINOGRAD &&
                 opr->param().output_block_size == 2 &&
                 param.winograd_matmul_format ==
                         param::MatrixMul::Format::MK4);
        //! F(2,3) only handles fp32 3x3 stride-1 non-dilated cross
        //! correlation with the default compute mode
        const bool conv_ok =
                opr->param().mode ==
                        param::ConvBias::Mode::CROSS_CORRELATION &&
                param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
                param.filter_meta.spatial[0] == 3 &&
                param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
                param.filter_meta.stride[0] == 1 &&
                param.filter_meta.dilation[0] ==
                        param.filter_meta.dilation[1] &&
                param.filter_meta.dilation[0] == 1 &&
                param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
                param.src_type.enumv() == DTypeEnum::Float32;
        return matmul_ok && format_ok && conv_ok;
    }
    MIDOUT_END();
    return false;
}
size_t ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44::get_workspace(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32,
                 midout_iv("AlgoFP32WinogradF23_4x4_NCHW44"_hash)) {
        //! the generic winograd ConvBias helper computes the workspace for
        //! the transform buffers plus the backing matmul
        using Strategy = winograd::winograd_F23_mk4_f_nchw44;
        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
        auto winograd_conv =
                megdnn::winograd::ConvBias<Strategy,
                                           param::MatrixMul::Format::MK4>(
                        strategy, m_tile_size, param.nr_threads, param.osz[0],
                        param.osz[1], param.filter_meta.ocpg);
        return winograd_conv.get_workspace_size(param, m_matmul_algo);
    }
    MIDOUT_END();
    return 0;
}
SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44::dispatch_kerns(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32,
                 midout_iv("AlgoFP32WinogradF23_4x4_NCHW44"_hash)) {
        //! kernel generation is delegated to the generic winograd helper,
        //! parameterized by this algo's F(2,3) NCHW44 strategy
        using Strategy = winograd::winograd_F23_mk4_f_nchw44;
        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
        return megdnn::winograd::ConvBias<Strategy,
                                          param::MatrixMul::Format::MK4>(
                       strategy, m_tile_size, param.nr_threads, param.osz[0],
                       param.osz[1], param.filter_meta.ocpg)
                .get_kerns(param, m_matmul_algo);
    }
    MIDOUT_END();
    return {};
}
/* =================== AlgoFP32WinogradF63_4x4_NCHW44 ===================== */ | |||
//! Check whether the F(6,3) NCHW44 winograd algorithm applies to `param`.
//! Requires 4-aligned channel groups, a usable NO_PACK matmul, an NCHW44
//! (or matching NCHW44_WINOGRAD) layout and a fp32 3x3 stride-1 conv.
bool ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44::usable(
        fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
    MEGDNN_MARK_USED_VAR(param);
    MEGDNN_MARK_USED_VAR(opr);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32,
                 midout_iv("AlgoFP32WinogradF63_4x4_NCHW44"_hash)) {
        //! NCHW44 packs channels in groups of 4
        if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0)
            return false;
        using Strategy = winograd::winograd_F63_mk4_f_nchw44;
        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
        auto&& matmul_param =
                megdnn::winograd::ConvBias<Strategy,
                                           param::MatrixMul::Format::MK4>(
                        strategy, m_tile_size, param.nr_threads, param.osz[0],
                        param.osz[1], param.filter_meta.ocpg)
                        .get_matmul_kern_param(param);
        //! NOTE: trailing `icpg % 4 == 0 && ocpg % 4 == 0` clauses were
        //! removed -- they duplicated the early return above (the F(2,3)
        //! variant never had them).
        return m_matmul_algo->usable(matmul_param) &&
               //! winograd pre-transforms both operands, so the matmul
               //! must not require its own packing
               m_matmul_algo->packmode() ==
                       fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK &&
               //! plain NCHW44, or pre-transformed NCHW44_WINOGRAD with
               //! output block size 6 and MK4 matmul format
               (opr->param().format == param::ConvBias::Format::NCHW44 ||
                (opr->param().format ==
                         param::ConvBias::Format::NCHW44_WINOGRAD &&
                 opr->param().output_block_size == 6 &&
                 param.winograd_matmul_format ==
                         param::MatrixMul::Format::MK4)) &&
               //! F(6,3): 3x3 kernel, stride 1, no dilation, fp32 only
               opr->param().mode == param::ConvBias::Mode::CROSS_CORRELATION &&
               (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
                param.filter_meta.spatial[0] == 3) &&
               (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
                param.filter_meta.stride[0] == 1) &&
               (param.filter_meta.dilation[0] ==
                        param.filter_meta.dilation[1] &&
                param.filter_meta.dilation[0] == 1) &&
               param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
               param.src_type.enumv() == DTypeEnum::Float32;
    }
    MIDOUT_END();
    return false;
}
size_t ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44::get_workspace(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32,
                 midout_iv("AlgoFP32WinogradF63_4x4_NCHW44"_hash)) {
        //! workspace covers the F(6,3) transform buffers and the matmul
        using Strategy = winograd::winograd_F63_mk4_f_nchw44;
        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
        auto winograd_conv =
                megdnn::winograd::ConvBias<Strategy,
                                           param::MatrixMul::Format::MK4>(
                        strategy, m_tile_size, param.nr_threads, param.osz[0],
                        param.osz[1], param.filter_meta.ocpg);
        return winograd_conv.get_workspace_size(param, m_matmul_algo);
    }
    MIDOUT_END();
    return 0;
}
SmallVector<ConvBiasImpl::NCBKern>
ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44::dispatch_kerns(
        fallback::ConvBiasImpl*, const NCBKernSizeParam& param) const {
    MEGDNN_MARK_USED_VAR(param);
    MIDOUT_BEGIN(megdnn_arm_common_winograd_fp32,
                 midout_iv("AlgoFP32WinogradF63_4x4_NCHW44"_hash)) {
        //! kernel generation is delegated to the generic winograd helper,
        //! parameterized by this algo's F(6,3) NCHW44 strategy
        using Strategy = winograd::winograd_F63_mk4_f_nchw44;
        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
        return megdnn::winograd::ConvBias<Strategy,
                                          param::MatrixMul::Format::MK4>(
                       strategy, m_tile_size, param.nr_threads, param.osz[0],
                       param.osz[1], param.filter_meta.ocpg)
                .get_kerns(param, m_matmul_algo);
    }
    MIDOUT_END();
    return {};
}
/* ===================== direct algo ===================== */ | |||
MIDOUT_DECL(megdnn_arm_common_conv_bias_f32_kimpl); | |||
@@ -157,6 +157,64 @@ private: | |||
uint32_t m_tile_size; | |||
}; | |||
//===================== NCHW44 Winograd Support =====================// | |||
//! Winograd F(2,3) fp32 conv-bias algorithm for the NCHW44 layout
//! (channels packed in groups of 4), backed by an MK4-format matmul.
class ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44 final : public AlgoBase {
public:
    //! \param matmul_algo matmul used for the batched-gemm stage
    //! \param tile_size number of output units processed per winograd tile
    AlgoFP32WinogradF23_4x4_NCHW44(
            fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size)
            : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {}
    bool is_reproducible() const override { return true; }
    //! lazily builds the canonical algo name from the matmul algo name and
    //! the winograd params {pack 4, output block 2, tile_size} for NCHW44
    const char* name() const override {
        if (m_name.empty()) {
            m_name = ConvBiasImpl::algo_name<ConvBias::WinogradParam>(
                    m_matmul_algo->name(), {4, 2, m_tile_size},
                    param::ConvBias::Format::NCHW44);
        }
        return m_name.c_str();
    }
    bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
                AlgoSelectionStrategy algo_selection_strategy) const override;
    size_t get_workspace(fallback::ConvBiasImpl*,
                         const NCBKernSizeParam& param) const override;
    //! redundant `virtual` removed: `override` already implies it
    SmallVector<NCBKern> dispatch_kerns(
            fallback::ConvBiasImpl* opr,
            const NCBKernSizeParam& param) const override;

private:
    fallback::MatrixMulImpl::AlgoBase* m_matmul_algo;
    //! cached name; mutable so the const name() can fill it lazily
    mutable std::string m_name;
    uint32_t m_tile_size;
};
//! Winograd F(6,3) fp32 conv-bias algorithm for the NCHW44 layout
//! (channels packed in groups of 4), backed by an MK4-format matmul.
class ConvBiasImpl::AlgoFP32WinogradF63_4x4_NCHW44 final : public AlgoBase {
public:
    //! \param matmul_algo matmul used for the batched-gemm stage
    //! \param tile_size number of output units processed per winograd tile
    AlgoFP32WinogradF63_4x4_NCHW44(
            fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size)
            : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {}
    bool is_reproducible() const override { return true; }
    //! lazily builds the canonical algo name from the matmul algo name and
    //! the winograd params {pack 4, output block 6, tile_size} for NCHW44
    const char* name() const override {
        if (m_name.empty()) {
            m_name = ConvBiasImpl::algo_name<ConvBias::WinogradParam>(
                    m_matmul_algo->name(), {4, 6, m_tile_size},
                    param::ConvBias::Format::NCHW44);
        }
        return m_name.c_str();
    }
    bool usable(fallback::ConvBiasImpl* opr, const NCBKernSizeParam& param,
                AlgoSelectionStrategy algo_selection_strategy) const override;
    size_t get_workspace(fallback::ConvBiasImpl*,
                         const NCBKernSizeParam& param) const override;
    //! redundant `virtual` removed: `override` already implies it
    SmallVector<NCBKern> dispatch_kerns(
            fallback::ConvBiasImpl* opr,
            const NCBKernSizeParam& param) const override;

private:
    fallback::MatrixMulImpl::AlgoBase* m_matmul_algo;
    //! cached name; mutable so the const name() can fill it lazily
    mutable std::string m_name;
    uint32_t m_tile_size;
};
// ================================================================= // | |||
class ConvBiasImpl::AlgoF32Direct final : public AlgoBase { | |||
SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const; | |||
bool m_large_group; | |||
@@ -32,6 +32,12 @@ MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 5, 4, 1, 1, | |||
MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 4, 5, 1, 1, | |||
winograd_4x5_1x1_f) | |||
//! F(2,3) strategy for NCHW44: output block 2, kernel 3, 4x4 channel packing
MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 2, 3, 4, 4,
                             winograd_F23_mk4_f_nchw44)
//! F(6,3) strategy for NCHW44: output block 6, kernel 3, 4x4 channel packing
MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 6, 3, 4, 4,
                             winograd_F63_mk4_f_nchw44)
} // namespace winograd | |||
} // namespace arm_common | |||
} // namespace megdnn | |||
@@ -0,0 +1,349 @@ | |||
/** | |||
* \file dnn/src/arm_common/conv_bias/fp32/strategy_f23_mk4_nchw44.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/ | |||
#include "src/arm_common/conv_bias/fp32/strategy.h" | |||
#include "src/arm_common/simd_macro/marm_neon.h" | |||
#include "src/arm_common/utils.h" | |||
#include "src/common/unroll_macro.h" | |||
#include "src/common/utils.h" | |||
#include "src/fallback/conv_bias/winograd/winograd.h" | |||
#include "src/naive/matrix_mul/matrix_mul_helper.h" | |||
#include "src/arm_common/elemwise_helper/op_unary.h" | |||
#include "src/arm_common/conv_bias/fp32/helper.h" | |||
#include "midout.h" | |||
MIDOUT_DECL(megdnn_arm_common_winograd_nchw44_fp32_F23_mk4) | |||
using namespace megdnn; | |||
using namespace arm_common; | |||
namespace { | |||
//! winograd tile side: OUTPUT_BLOCK_SIZE (2) + KERNEL_SIZE (3) - 1 = 4
constexpr size_t alpha = 2 + 3 - 1;
//! NCHW44 packs 4 channels together; every vector load/store moves 4 floats
constexpr size_t pack_size = 4;

//! Gathers one alpha x alpha x 4 input patch and applies the winograd
//! F(2,3) input transform BT * d * B to it.
struct InputTransformF23_NCHW44 {
    //! Copy the patch at (ih_start, iw_start) of channel block `ic` into the
    //! contiguous scratch buffer `patchT` (alpha x alpha x pack_size floats).
    //! \tparam inner true when the patch lies fully inside the input image;
    //!         the false path copies only the valid region and relies on the
    //!         memset below to zero the padded remainder.
    template <bool inner>
    static void prepare(const float* input, float* patch, float* patchT,
                        int ih_start, int iw_start, size_t IH, size_t IW,
                        size_t ic, size_t IC) {
        MEGDNN_MARK_USED_VAR(patch);
        size_t IW4 = IW * pack_size;    //! floats per packed input row
        size_t iw4_start = iw_start * pack_size;
        size_t icb = ic / pack_size;    //! current channel-block index
        //! zero the scratch unless this is an interior patch of a non-final
        //! channel block.
        //! NOTE(review): the `ic + pack_size < IC` guard re-zeroes even for
        //! inner patches of the last channel block -- presumably intentional;
        //! confirm against the other layout variants.
        if (!(inner && ic + pack_size < IC)) {
            memset(patchT, 0, sizeof(float) * pack_size * alpha * alpha);
        }
        if (inner) {
            //! fast path: 4 rows of 4 packed pixels, no bounds checks
            const float* input_ptr =
                    input + icb * IH * IW4 + ih_start * IW4 + iw4_start;
            for (size_t ih = 0; ih < alpha; ih++) {
#define cb(i) auto v##i = vld1q_f32(input_ptr + pack_size * i);
                UNROLL_CALL_NOWRAPPER(4, cb);
#undef cb
#define cb(i) vst1q_f32(patchT + ih * alpha * pack_size + i * pack_size, v##i);
                UNROLL_CALL_NOWRAPPER(4, cb);
#undef cb
                input_ptr += IW4;
            }
        } else {
            //! clamp the patch window to the valid image region
            int ih0_act = std::max<int>(ih_start, 0),
                ih1_act = std::min<int>(ih_start + alpha, IH),
                iw0_act = std::max<int>(iw_start, 0),
                iw1_act = std::min<int>(iw_start + alpha, IW);
            const float* input_ptr = input + icb * IH * IW4;
            // partial copy
            for (int ih = ih0_act; ih < ih1_act; ++ih) {
                for (int iw = iw0_act; iw < iw1_act; ++iw) {
                    size_t iho = ih - ih_start, iwo = iw - iw_start;
                    auto src = vld1q_f32(input_ptr + ih * IW4 + iw * pack_size);
                    vst1q_f32(
                            patchT + iho * alpha * pack_size + iwo * pack_size,
                            src);
                }
            }
        }
    }

    //! Apply BT * d * B to the prepared patch and scatter the 16 results
    //! into input_transform_buf, laid out [alpha*alpha][ICB][nr_units][4].
    static void transform(const float* patchT, float* input_transform_buf,
                          size_t unit_idx, size_t nr_units_in_tile, size_t ic,
                          size_t IC) {
        // BT * d * B
#define cb(m, n)                                       \
    Vector<float, 4> d##m##n = Vector<float, 4>::load( \
            patchT + m * alpha * pack_size + n * pack_size);
        UNROLL_CALL_NOWRAPPER_D2(4, 4, cb);
#undef cb
        //! 1  0 -1 0    d00 d01 d02 d03     1 0  0  0
        //! 0  1  1 0    d10 d11 d12 d13     0 1 -1 -1
        //! 0 -1  1 0    d20 d21 d22 d23    -1 1  1  0
        //! 0 -1  0 1    d30 d31 d32 d33     0 0  0  1
        //! left-multiply by BT (row combinations of d)
#define cb(m)                   \
    auto t0##m = d0##m - d2##m; \
    auto t1##m = d1##m + d2##m; \
    auto t2##m = d2##m - d1##m; \
    auto t3##m = d3##m - d1##m;
        UNROLL_CALL_NOWRAPPER(4, cb);
#undef cb
        //! right-multiply by B (column combinations of t)
#define cb(m)                       \
    d##m##0 = t##m##0 - t##m##2;    \
    d##m##1 = t##m##1 + t##m##2;    \
    d##m##2 = t##m##2 - t##m##1;    \
    d##m##3 = t##m##3 - t##m##1;
        UNROLL_CALL_NOWRAPPER(4, cb);
#undef cb
        size_t ICB = IC / 4;
        size_t icb = ic / 4;
        //! scatter: one contiguous run of pack_size floats per (m, n) slot
#define cb(m, n)                                                           \
    d##m##n.save(input_transform_buf +                                     \
                 (m * alpha + n) * ICB * nr_units_in_tile * pack_size +    \
                 icb * nr_units_in_tile * pack_size + unit_idx * pack_size);
        UNROLL_CALL_NOWRAPPER_D2(4, 4, cb)
#undef cb
    }
};
#define CONCAT(a, idx) a##idx
//! Winograd F(2,3) output transform AT * m * A for NCHW44, followed by bias
//! addition and the elementwise activation Op.
//! \tparam bmode how bias is supplied (none / broadcast per channel / full)
//! \tparam Op activation operator applied to the final 2x2 outputs
template <BiasMode bmode, typename Op>
struct OutputTransformF23_NCHW44 {
    //! Transform one 4x4x4 tile of matmul output into the 2x2x4 conv output
    //! at (oh_start, ow_start); edge tiles are clipped against OH/OW.
    static void transform(const float* output_transform_buf, const float* bias,
                          float* output, float* transform_mid_buf,
                          size_t oh_start, size_t ow_start, size_t OH,
                          size_t OW, size_t oc_start, size_t oc_end,
                          size_t oc_index, size_t unit_idx,
                          size_t nr_units_in_tile, const DType& src_dtype,
                          const DType& dst_dtype) {
        MEGDNN_MARK_USED_VAR(transform_mid_buf);
        Op op(src_dtype, dst_dtype);
        //! AT * m * A
        size_t OCB = (oc_end - oc_start) / pack_size;
        size_t oc = oc_start + oc_index;        //! absolute output channel
        size_t ocb = oc_index / pack_size;      //! channel block within range
        //! gather the 4x4 transformed tile from the matmul output layout
        //! [alpha*alpha][OCB][nr_units][4]
#define cb(m, n)                                                       \
    auto v##m##n = Vector<float, 4>::load(                             \
            output_transform_buf +                                     \
            (m * alpha + n) * OCB * nr_units_in_tile * pack_size +     \
            ocb * nr_units_in_tile * pack_size + unit_idx * pack_size);
        UNROLL_CALL_NOWRAPPER_D2(4, 4, cb);
#undef cb
        //! 1  1  1 0    v00 v01 v02 v03     1  0
        //! 0  1 -1 1    v10 v11 v12 v13     1  1
        //!              v20 v21 v22 v23     1 -1
        //!              v30 v31 v32 v33     0  1
        //! left-multiply by AT
#define cb(m)                               \
    auto t0##m = v0##m + v1##m + v2##m;     \
    auto t1##m = v1##m - v2##m + v3##m;
        UNROLL_CALL_NOWRAPPER(4, cb);
#undef cb
        //! right-multiply by A
#define cb(m)                               \
    v##m##0 = t##m##0 + t##m##1 + t##m##2;  \
    v##m##1 = t##m##1 - t##m##2 + t##m##3;
        UNROLL_CALL_NOWRAPPER(2, cb);
#undef cb
        Vector<float, 4> vbias;
        //! per-channel bias: one packed vector reused for all 2x2 outputs
        if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) {
            vbias = Vector<float, 4>::load(bias + oc);
#define cb(m, n) v##m##n += vbias;
            UNROLL_CALL_RAW_D2(2, 2, cb);
#undef cb
        }
        //! full-tensor bias is added (and activated) per pixel in out_save;
        //! every other mode activates here
        if (bmode != BiasMode::BIAS) {
#define cb(m, n) v##m##n = op(CONCAT(v##m, n).value);
            UNROLL_CALL_RAW_D2(2, 2, cb);
#undef cb
        }
        //! store the 2x2 block, skipping out-of-range border pixels
#define out_save(oho, owo)                                               \
    do {                                                                 \
        size_t oh = oh_start + oho;                                      \
        size_t ow = ow_start + owo;                                      \
        if (oh < OH && ow < OW) {                                        \
            if (bmode == BiasMode::BIAS) {                               \
                v##oho##owo += Vector<float, 4>::load(bias + oc * OH * OW + \
                                                      oh * OW * pack_size + \
                                                      ow * pack_size);   \
                v##oho##owo = op(v##oho##owo.value);                     \
            }                                                            \
            v##oho##owo.save(output + oc * OH * OW + oh * OW * pack_size + \
                             ow * pack_size);                            \
        }                                                                \
    } while (0);
        UNROLL_CALL_RAW_D2(2, 2, out_save);
#undef out_save
    }
};
#undef CONCAT
} // namespace | |||
namespace megdnn { | |||
namespace arm_common { | |||
namespace winograd { | |||
MEGDNN_REG_WINOGRAD_STRATEGY_IMPL(winograd_F23_mk4_f_nchw44) | |||
//! Winograd F(2,3) filter transform G * g * GT for the NCHW44 layout.
//! Transforms the 3x3 filters of output-channel blocks [oc_start, oc_end)
//! into filter_transform_buf, laid out [ALPHA*ALPHA][OCB][ICB][4][4].
void winograd_F23_mk4_f_nchw44::filter(const float* filter,
                                       float* filter_transform_buf,
                                       float* transform_mid_buf, size_t OC,
                                       size_t IC, size_t oc_start,
                                       size_t oc_end) {
    //! 1    0    0      v00 v01 v02    1 0.5  0.5 0
    //! 0.5  0.5  0.5    v10 v11 v12    0 0.5 -0.5 0
    //! 0.5 -0.5  0.5    v20 v21 v22    0 0.5  0.5 1
    //! 0    0    1
    constexpr size_t pack_size = 4;
    MEGDNN_MARK_USED_VAR(transform_mid_buf);
    megdnn_assert((oc_end - oc_start) % pack_size == 0 &&
                          oc_start % pack_size == 0 &&
                          oc_end % pack_size == 0 && IC % pack_size == 0 &&
                          OC % pack_size == 0,
                  "NCHW44 Winograd filter transform requires both OC and IC "
                  "are times of 4");
    size_t OCB = OC / pack_size;
    size_t ICB = IC / pack_size;
    //! iterate every (output block, input block, input lane) combination;
    //! each iteration transforms one 3x3 filter of 4 packed output channels
    for (size_t ocb = oc_start / pack_size; ocb < oc_end / pack_size; ocb++) {
        for (size_t icb = 0; icb < ICB; icb++) {
            for (size_t ic_inner = 0; ic_inner < pack_size; ic_inner++) {
                //! filter layout: [OCB][ICB][kh][kw][4ic][4oc]
                const float* fptr = filter +
                                    (ocb * ICB + icb) * KERNEL_SIZE *
                                            KERNEL_SIZE * pack_size *
                                            pack_size +
                                    ic_inner * pack_size;
                //! load the 3x3 taps, 4 output channels per vector
#define cb(m, n)                                       \
    Vector<float, 4> g##m##n = Vector<float, 4>::load( \
            fptr + (m * KERNEL_SIZE + n) * pack_size * pack_size);
                UNROLL_CALL_NOWRAPPER_D2(3, 3, cb)
#undef cb
                //! applied twice: first G * g (3 -> 4 rows), then the
                //! result times GT (3 -> 4 columns)
#define FILTER_TRANSFORM(n, wd, g)          \
    auto wd##n##0 = g##0##n;                \
    tmp0 = (g##0##n + g##2##n) * 0.5;       \
    tmp1 = g##1##n * 0.5;                   \
    auto wd##n##1 = tmp0 + tmp1;            \
    auto wd##n##2 = tmp0 - tmp1;            \
    auto wd##n##3 = g##2##n;
                Vector<float, 4> tmp0, tmp1;
                UNROLL_CALL_RAW(3, FILTER_TRANSFORM, wd, g);
                UNROLL_CALL_RAW(4, FILTER_TRANSFORM, ret, wd);
#undef FILTER_TRANSFORM
                //! scatter into [ALPHA*ALPHA][OCB][ICB][4][4]
#define cb_save(m, n)                                                    \
    ret##m##n.save(filter_transform_buf +                                \
                   (m * ALPHA + n) * OCB * ICB * pack_size * pack_size + \
                   ocb * ICB * pack_size * pack_size +                   \
                   icb * pack_size * pack_size + ic_inner * pack_size);
                UNROLL_CALL_NOWRAPPER_D2(4, 4, cb_save)
#undef cb_save
            }
        }
    }
}
void winograd_F23_mk4_f_nchw44::input(const float* input, | |||
float* input_transform_buf, | |||
float* transform_mid_buf, size_t IH, | |||
size_t IW, size_t IC, size_t PH, | |||
size_t PW, size_t unit_start_idx, | |||
size_t nr_units_in_tile) { | |||
megdnn_assert(IC % 4 == 0); | |||
constexpr int alpha = 3 + 2 - 1; | |||
// OW = IW + 2 * PW - KERNEL_SIZE + 1 | |||
auto units_w = | |||
div_ceil<size_t>(IW + 2 * PW - KERNEL_SIZE + 1, OUTPUT_BLOCK_SIZE); | |||
float* patch = transform_mid_buf; | |||
float* patchT = transform_mid_buf + 4 * alpha * alpha; | |||
for (size_t ic = 0; ic < IC; ic += 4) { | |||
rep(unit_idx, nr_units_in_tile) { | |||
size_t index = unit_start_idx + unit_idx; | |||
size_t nh = index / units_w; | |||
size_t nw = index % units_w; | |||
int ih_start = nh * OUTPUT_BLOCK_SIZE - PH; | |||
int iw_start = nw * OUTPUT_BLOCK_SIZE - PW; | |||
if (ih_start >= 0 && ih_start + alpha <= static_cast<int>(IH) && | |||
iw_start >= 0 && iw_start + alpha <= static_cast<int>(IW)) { | |||
InputTransformF23_NCHW44::prepare<true>(input, patch, patchT, | |||
ih_start, iw_start, IH, | |||
IW, ic, IC); | |||
InputTransformF23_NCHW44::transform(patchT, input_transform_buf, | |||
unit_idx, nr_units_in_tile, | |||
ic, IC); | |||
} else { | |||
InputTransformF23_NCHW44::prepare<false>(input, patch, patchT, | |||
ih_start, iw_start, IH, | |||
IW, ic, IC); | |||
InputTransformF23_NCHW44::transform(patchT, input_transform_buf, | |||
unit_idx, nr_units_in_tile, | |||
ic, IC); | |||
} | |||
} | |||
} | |||
} | |||
//! Winograd F(2,3) output stage for NCHW44: applies AT * m * A, bias and the
//! non-linearity per 2x2 output block, dispatching on (bmode, nonline_mode).
//! NOTE(review): `src_dtype` / `dst_dtype` used in the dispatch below are
//! presumably members injected by MEGDNN_REG_WINOGRAD_STRATEGY_IMPL --
//! confirm against the macro definition.
void winograd_F23_mk4_f_nchw44::output(const float* output_transform_buf,
                                       const float* bias, float* output,
                                       float* transform_mid_buf, BiasMode bmode,
                                       NonlineMode nonline_mode, size_t OH,
                                       size_t OW, size_t oc_start,
                                       size_t oc_end, size_t unit_start_idx,
                                       size_t nr_units_in_tile) {
#define cb(_bmode, _nonline_op, ...)                                     \
    OutputTransformF23_NCHW44<_bmode MEGDNN_COMMA _nonline_op>::transform( \
            __VA_ARGS__);

    auto units_w = div_ceil<size_t>(OW, OUTPUT_BLOCK_SIZE);
    constexpr size_t pack_size = 4;
    size_t OC = oc_end - oc_start;
    megdnn_assert(OC % pack_size == 0 && oc_start % pack_size == 0 &&
                          oc_end % pack_size == 0,
                  "NCHW44 Winograd filter transform requires OC is times of 4");
    //! one iteration per 4-channel output block and per unit in the tile
    for (size_t oc = oc_start; oc < oc_end; oc += 4) {
        size_t oc_index = oc - oc_start;
        rep(unit_idx, nr_units_in_tile) {
            size_t index = unit_start_idx + unit_idx;
            auto nh = index / units_w;
            auto nw = index % units_w;
            size_t oh_start = nh * OUTPUT_BLOCK_SIZE;
            size_t ow_start = nw * OUTPUT_BLOCK_SIZE;
            DISPATCH_CONV_WINOGRAD_BIAS(
                    megdnn_arm_common_winograd_nchw44_fp32_F23_mk4, cb, float,
                    float, bmode, nonline_mode, output_transform_buf, bias,
                    output, transform_mid_buf, oh_start, ow_start, OH, OW,
                    oc_start, oc_end, oc_index, unit_idx, nr_units_in_tile,
                    src_dtype, dst_dtype);
        }
    }
#undef cb
}
} // namespace winograd | |||
} // namespace arm_common | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,410 @@ | |||
/** | |||
 * \file dnn/src/arm_common/conv_bias/fp32/strategy_f63_mk4_nchw44.cpp
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2020 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
*/ | |||
#include "src/arm_common/conv_bias/fp32/filter_transform.h" | |||
#include "src/arm_common/conv_bias/fp32/helper.h" | |||
#include "src/arm_common/conv_bias/fp32/strategy.h" | |||
#include "src/arm_common/elemwise_helper/op_unary.h" | |||
#include "src/arm_common/simd_macro/marm_neon.h" | |||
#include "src/arm_common/utils.h" | |||
#include "src/common/unroll_macro.h" | |||
#include "src/common/utils.h" | |||
#include "src/common/winograd/winograd_helper.h" | |||
#include "src/fallback/conv_bias/winograd/winograd.h" | |||
#include "midout.h" | |||
MIDOUT_DECL(megdnn_arm_common_winograd_fp32_F63_mk4) | |||
using namespace megdnn; | |||
using namespace arm_common; | |||
namespace { | |||
//! winograd tile side: OUTPUT_BLOCK_SIZE (6) + KERNEL_SIZE (3) - 1 = 8
constexpr size_t alpha = 6 + 3 - 1;
//! NCHW44 packs 4 channels together; every vector load/store moves 4 floats
constexpr size_t pack_size = 4;

//! Gathers one 8x8x4 input patch and applies the winograd F(6,3) input
//! transform BT * d * B to it.
struct InputTransformF63_NCHW44 {
    //! Copy the patch at (ih_start, iw_start) of channel block `ic` into the
    //! contiguous scratch buffer `patchT` (alpha x alpha x pack_size floats).
    //! \tparam inner true when the patch lies fully inside the input image;
    //!         the false path copies only the valid region and relies on the
    //!         memset below to zero the padded remainder.
    template <bool inner>
    static void prepare(const float* input, float* patch, float* patchT,
                        int ih_start, int iw_start, size_t IH, size_t IW,
                        size_t ic, size_t IC) {
        MEGDNN_MARK_USED_VAR(patch);
        size_t IW4 = IW * pack_size;    //! floats per packed input row
        size_t iw4_start = iw_start * pack_size;
        size_t icb = ic / pack_size;    //! current channel-block index
        //! zero the scratch unless this is an interior patch of a non-final
        //! channel block.
        //! NOTE(review): same `ic + pack_size < IC` guard as the F23 file --
        //! re-zeroes inner patches of the last channel block; confirm intent.
        if (!(inner && ic + pack_size < IC)) {
            memset(patchT, 0, sizeof(float) * pack_size * alpha * alpha);
        }
        if (inner) {
            //! fast path: 8 rows of 8 packed pixels, no bounds checks
            const float* input_ptr =
                    input + icb * IH * IW4 + ih_start * IW4 + iw4_start;
            for (size_t ih = 0; ih < alpha; ih++) {
#define cb(i) auto v##i = vld1q_f32(input_ptr + pack_size * i);
                UNROLL_CALL_NOWRAPPER(8, cb);
#undef cb
#define cb(i) vst1q_f32(patchT + ih * pack_size * alpha + i * pack_size, v##i);
                UNROLL_CALL_NOWRAPPER(8, cb);
#undef cb
                input_ptr += IW4;
            }
        } else {
            //! clamp the patch window to the valid image region
            int ih0_act = std::max<int>(ih_start, 0),
                ih1_act = std::min<int>(ih_start + alpha, IH),
                iw0_act = std::max<int>(iw_start, 0),
                iw1_act = std::min<int>(iw_start + alpha, IW);
            const float* input_ptr = input + icb * IH * IW4;
            // partial copy
            for (int ih = ih0_act; ih < ih1_act; ++ih) {
                for (int iw = iw0_act; iw < iw1_act; ++iw) {
                    size_t iho = ih - ih_start, iwo = iw - iw_start;
                    auto src = vld1q_f32(input_ptr + ih * IW4 + iw * pack_size);
                    vst1q_f32(
                            patchT + iho * pack_size * alpha + iwo * pack_size,
                            src);
                }
            }
        }
    }

    //! Apply BT * d * B to the prepared 8x8 patch and scatter the 64 results
    //! into input_transform_buf, laid out [alpha*alpha][ICB][nr_units][4].
    static void transform(const float* patchT, float* input_transform_buf,
                          size_t unit_idx, size_t nr_units_in_tile, size_t ic,
                          size_t IC) {
        // BT * d * B
#define cb(m, n)                                       \
    Vector<float, 4> d##m##n = Vector<float, 4>::load( \
            patchT + m * alpha * pack_size + n * pack_size);
        UNROLL_CALL_NOWRAPPER_D2(8, 8, cb);
#undef cb
        //! B
        //! 1     0     0     0     0    0    0     0
        //! 0     1    -1     0.5  -0.5  2   -2    -1
        //! -5.25 1     1     0.25  0.25 4    4     0
        //! 0    -4.25  4.25 -2.5   2.5 -2.5  2.5   5.25
        //! 5.25 -4.25 -4.25 -1.25 -1.25 -5  -5     0
        //! 0     1    -1     2    -2    0.5 -0.5  -5.25
        //! -1    1     1     1     1    1    1     0
        //! 0     0     0     0     0    0    0     1
        //! left-multiply by BT (row combinations of d)
#define cb(m)                                                                 \
    auto t0##m = d0##m + (d4##m - d2##m) * 5.25f - d6##m;                     \
    auto t1##m = d1##m + d2##m + d5##m + d6##m - (d3##m + d4##m) * 4.25f;     \
    auto t2##m = d2##m + d6##m - (d1##m + d5##m) + (d3##m - d4##m) * 4.25f;   \
    auto t3##m = d1##m * 0.5f + d2##m * 0.25f - d3##m * 2.5f - d4##m * 1.25f + \
                 d5##m * 2.f + d6##m;                                         \
    auto t4##m = d1##m * (-0.5f) + d2##m * 0.25f + d3##m * 2.5f -             \
                 d4##m * 1.25f - d5##m * 2.f + d6##m;                         \
    auto t5##m = d1##m * 2.f + d2##m * 4.f - d3##m * 2.5f - d4##m * 5.f +     \
                 d5##m * 0.5f + d6##m;                                        \
    auto t6##m = d1##m * (-2.f) + d2##m * 4.f + d3##m * 2.5f - d4##m * 5.f -  \
                 d5##m * 0.5f + d6##m;                                        \
    auto t7##m = (d7##m - d1##m) + (d3##m - d5##m) * 5.25f;
        UNROLL_CALL_NOWRAPPER(8, cb);
#undef cb
        //! right-multiply by B (column combinations of t)
#define cb(m)                                                                \
    d##m##0 = t##m##0 + (t##m##4 - t##m##2) * 5.25f - t##m##6;               \
    d##m##1 = t##m##1 + t##m##2 + t##m##5 + t##m##6 -                        \
              (t##m##3 + t##m##4) * 4.25f;                                   \
    d##m##2 = t##m##2 + t##m##6 - (t##m##1 + t##m##5) +                      \
              (t##m##3 - t##m##4) * 4.25f;                                   \
    d##m##3 = t##m##1 * 0.5f + t##m##2 * 0.25f - t##m##3 * 2.5f -            \
              t##m##4 * 1.25f + t##m##5 * 2.f + t##m##6;                     \
    d##m##4 = t##m##1 * (-0.5f) + t##m##2 * 0.25f + t##m##3 * 2.5f -         \
              t##m##4 * 1.25f - t##m##5 * 2.f + t##m##6;                     \
    d##m##5 = t##m##1 * 2.f + t##m##2 * 4.f - t##m##3 * 2.5f - t##m##4 * 5.f + \
              t##m##5 * 0.5f + t##m##6;                                      \
    d##m##6 = t##m##1 * (-2.f) + t##m##2 * 4.f + t##m##3 * 2.5f -            \
              t##m##4 * 5.f - t##m##5 * 0.5f + t##m##6;                      \
    d##m##7 = (t##m##7 - t##m##1) + (t##m##3 - t##m##5) * 5.25f;
        UNROLL_CALL_NOWRAPPER(8, cb);
#undef cb
        size_t ICB = IC / pack_size;
        size_t icb = ic / pack_size;
        //! scatter: one contiguous run of pack_size floats per (m, n) slot
#define cb(m, n)                                                           \
    d##m##n.save(input_transform_buf +                                     \
                 (m * alpha + n) * ICB * nr_units_in_tile * pack_size +    \
                 icb * nr_units_in_tile * pack_size + unit_idx * pack_size);
        UNROLL_CALL_NOWRAPPER_D2(8, 8, cb)
#undef cb
    }
};
//! Winograd F(6,3) output transform AT * m * A for NCHW44, followed by bias
//! addition and the elementwise activation Op.
//! \tparam bmode how bias is supplied (none / broadcast per channel / full)
//! \tparam Op activation operator applied to the final 6x6 outputs
//! NOTE(review): this file uses CONCAT below but does not define it locally;
//! presumably it comes from conv_bias/fp32/helper.h -- confirm.
template <BiasMode bmode, typename Op>
struct OutputTransformF63_NCHW44 {
    //! Transform one 8x8x4 tile of matmul output into the 6x6x4 conv output
    //! at (oh_start, ow_start); edge tiles are clipped against OH/OW.
    static void transform(const float* output_transform_buf, const float* bias,
                          float* output, float* transform_mid_buf,
                          size_t oh_start, size_t ow_start, size_t OH,
                          size_t OW, size_t oc_start, size_t oc_end,
                          size_t oc_index, size_t unit_idx,
                          size_t nr_units_in_tile, const DType& src_dtype,
                          const DType& dst_dtype) {
        MEGDNN_MARK_USED_VAR(transform_mid_buf);
        Op op(src_dtype, dst_dtype);
        //! AT * m * A
        size_t oc = oc_start + oc_index;        //! absolute output channel
        size_t OCB = (oc_end - oc_start) / pack_size;
        size_t ocb = oc_index / pack_size;      //! channel block within range
        //! gather the 8x8 transformed tile from the matmul output layout
        //! [alpha*alpha][OCB][nr_units][4]
#define cb(m, n)                                                       \
    auto v##m##n = Vector<float, 4>::load(                             \
            output_transform_buf +                                     \
            (m * alpha + n) * OCB * nr_units_in_tile * pack_size +     \
            ocb * nr_units_in_tile * pack_size + unit_idx * pack_size);
        UNROLL_CALL_NOWRAPPER_D2(8, 8, cb);
#undef cb
        /**
         * A
         *
         * 1    0      0     0        0        0
         * 1    1      1     1        1        1
         * 1   -1      1    -1        1       -1
         * 1    2      4     8       16       32
         * 1   -2      4    -8       16      -32
         * 1    0.5    0.25  0.125   0.0625   0.03125
         * 1   -0.5    0.25 -0.125   0.0625  -0.03125
         * 0    0.0    0     0       0        1
         */
        //! shared sums/differences reused by all six output rows/columns
        Vector<float, 4> v1addv2, v1subv2, v3addv4, v3subv4, v5addv6, v5subv6;
        //! left-multiply by AT (row combinations of v)
#define cb(m)                                                        \
    v1addv2 = v1##m + v2##m;                                         \
    v1subv2 = v1##m - v2##m;                                         \
    v3addv4 = v3##m + v4##m;                                         \
    v3subv4 = v3##m - v4##m;                                         \
    v5addv6 = v5##m + v6##m;                                         \
    v5subv6 = v5##m - v6##m;                                         \
    auto t0##m = v0##m + v1addv2 + v3addv4 + v5addv6;                \
    auto t1##m = v1subv2 + v3subv4 * 2.f + v5subv6 * 0.5f;           \
    auto t2##m = v1addv2 + v3addv4 * 4.f + v5addv6 * 0.25f;          \
    auto t3##m = v1subv2 + v3subv4 * 8.f + v5subv6 * 0.125f;         \
    auto t4##m = v1addv2 + v3addv4 * 16.f + v5addv6 * 0.0625f;       \
    auto t5##m = v1subv2 + v3subv4 * 32.f + v5subv6 * 0.03125f + v7##m;
        UNROLL_CALL_NOWRAPPER(8, cb);
#undef cb
        //! right-multiply by A (column combinations of t)
#define cb(m)                                                        \
    v1addv2 = t##m##1 + t##m##2;                                     \
    v1subv2 = t##m##1 - t##m##2;                                     \
    v3addv4 = t##m##3 + t##m##4;                                     \
    v3subv4 = t##m##3 - t##m##4;                                     \
    v5addv6 = t##m##5 + t##m##6;                                     \
    v5subv6 = t##m##5 - t##m##6;                                     \
    v##m##0 = t##m##0 + v1addv2 + v3addv4 + v5addv6;                 \
    v##m##1 = v1subv2 + v3subv4 * 2.f + v5subv6 * 0.5f;              \
    v##m##2 = v1addv2 + v3addv4 * 4.f + v5addv6 * 0.25f;             \
    v##m##3 = v1subv2 + v3subv4 * 8.f + v5subv6 * 0.125f;            \
    v##m##4 = v1addv2 + v3addv4 * 16.f + v5addv6 * 0.0625f;          \
    v##m##5 = v1subv2 + v3subv4 * 32.f + v5subv6 * 0.03125f + t##m##7;
        UNROLL_CALL_NOWRAPPER(6, cb);
#undef cb
        Vector<float, 4> vbias;
        //! per-channel bias: one packed vector reused for all 6x6 outputs
        if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) {
            vbias = Vector<float, 4>::load(bias + oc);
#define cb(m, n) v##m##n += vbias;
            UNROLL_CALL_RAW_D2(6, 6, cb);
#undef cb
        }
        //! full-tensor bias is added (and activated) per pixel in out_save;
        //! every other mode activates here
        if (bmode != BiasMode::BIAS) {
#define cb(m, n) v##m##n = op(CONCAT(v##m, n).value);
            UNROLL_CALL_RAW_D2(6, 6, cb);
#undef cb
        }
        //! store the 6x6 block, skipping out-of-range border pixels
#define out_save(oho, owo)                                               \
    do {                                                                 \
        size_t oh = oh_start + oho;                                      \
        size_t ow = ow_start + owo;                                      \
        if (oh < OH && ow < OW) {                                        \
            if (bmode == BiasMode::BIAS) {                               \
                v##oho##owo += Vector<float, 4>::load(bias + oc * OH * OW + \
                                                      oh * OW * pack_size + \
                                                      ow * pack_size);   \
                v##oho##owo = op(v##oho##owo.value);                     \
            }                                                            \
            v##oho##owo.save(output + oc * OH * OW + oh * OW * pack_size + \
                             ow * pack_size);                            \
        }                                                                \
    } while (0);
        UNROLL_CALL_RAW_D2(6, 6, out_save);
    }
//! NOTE(review): this #undef sits between the method's closing brace and the
//! struct's -- harmless to the preprocessor, but the F23 file places it
//! inside the method; consider aligning them.
#undef out_save
};
} // namespace | |||
namespace megdnn { | |||
namespace arm_common { | |||
namespace winograd { | |||
MEGDNN_REG_WINOGRAD_STRATEGY_IMPL(winograd_F63_mk4_f_nchw44) | |||
//! Filter transform for F(6x3) Winograd in NCHW44 layout: for every 3x3
//! kernel and every 4-wide channel slice, compute ret = G * g * G^T with
//! the 8x3 transform matrix G listed below, writing the 8x8 result into
//! \p filter_transform_buf.
//!
//! \param filter source weights; the index arithmetic below corresponds to
//!        a {OC/4, IC/4, KERNEL_SIZE, KERNEL_SIZE, 4, 4} packed layout
//! \param filter_transform_buf destination; element (m, n) of each 8x8
//!        transformed block is stored at offset (m * alpha + n) * OC * IC
//!        plus the channel-block offset (see cb_save)
//! \param transform_mid_buf unused scratch buffer (kept for interface parity)
//! \param OC, IC total output / input channels, both multiples of 4
//! \param oc_start, oc_end output-channel range handled by this call; both
//!        multiples of 4 so the work can be split across threads
void winograd_F63_mk4_f_nchw44::filter(const float* filter,
                                       float* filter_transform_buf,
                                       float* transform_mid_buf, size_t OC,
                                       size_t IC, size_t oc_start,
                                       size_t oc_end) {
    constexpr size_t pack_size = 4;
    // Gg * GT
    // G
    // 1.0000000  0.0000000  0.0000000
    // -0.2222222 -0.2222222 -0.2222222
    // -0.2222222  0.2222222 -0.2222222
    // 0.0111111   0.0222222  0.0444444
    // 0.0111111  -0.0222222  0.0444444
    // 0.7111111   0.3555556  0.1777778
    // 0.7111111  -0.3555556  0.1777778
    // 0.0000000   0.0000000  1.0000000
    MEGDNN_MARK_USED_VAR(transform_mid_buf);
    megdnn_assert((oc_end - oc_start) % pack_size == 0 &&
                          oc_start % pack_size == 0 &&
                          oc_end % pack_size == 0 && IC % pack_size == 0 &&
                          OC % pack_size == 0,
                  "NCHW44 Winograd filter transform requires both OC and IC "
                  "are times of 4");
    //! number of 4-channel input blocks
    size_t ICB = IC / pack_size;
    for (size_t ocb = oc_start / pack_size; ocb < oc_end / pack_size; ocb++) {
        for (size_t icb = 0; icb < ICB; icb++) {
            for (size_t ic_inner = 0; ic_inner < pack_size; ic_inner++) {
                //! base of the 3x3 kernel for this (ocb, icb, ic_inner)
                const float* fptr = filter +
                                    (ocb * ICB + icb) * KERNEL_SIZE *
                                            KERNEL_SIZE * pack_size *
                                            pack_size +
                                    ic_inner * pack_size;
//! load the 3x3 kernel as 4-lane vectors g00..g22 (one lane per oc)
#define cb(m, n)                                                 \
    Vector<float, 4> g##m##n = Vector<float, 4>::load(           \
            fptr + (m * KERNEL_SIZE + n) * pack_size * pack_size);
                UNROLL_CALL_NOWRAPPER_D2(3, 3, cb)
#undef cb
//! one column (row) of the transform: wd = G * g, then ret = wd * G^T;
//! tmp0/tmp1 factor out the shared sums so each pair costs two FMAs
#define FILTER_TRANSFORM(n, wd, g)                          \
    auto wd##n##0 = g##0##n;                                \
    tmp0 = (g##0##n + g##2##n) * -0.2222222f;               \
    tmp1 = g##1##n * -0.2222222f;                           \
    auto wd##n##1 = tmp0 + tmp1;                            \
    auto wd##n##2 = tmp0 - tmp1;                            \
    tmp0 = g##0##n * 0.0111111f + g##2##n * 0.0444444f;     \
    tmp1 = g##1##n * 0.0222222f;                            \
    auto wd##n##3 = tmp0 + tmp1;                            \
    auto wd##n##4 = tmp0 - tmp1;                            \
    tmp0 = g##0##n * 0.7111111f + g##2##n * 0.1777778f;     \
    tmp1 = g##1##n * 0.3555556f;                            \
    auto wd##n##5 = tmp0 + tmp1;                            \
    auto wd##n##6 = tmp0 - tmp1;                            \
    auto wd##n##7 = g##2##n;
                Vector<float, 4> tmp0, tmp1;
                UNROLL_CALL_RAW(3, FILTER_TRANSFORM, wd, g);
                UNROLL_CALL_RAW(8, FILTER_TRANSFORM, ret, wd);
#undef FILTER_TRANSFORM
//! scatter the 8x8 transformed block; (m, n) selects the alpha x alpha
//! plane, the remaining terms select the packed channel block
#define cb_save(m, n)                                                   \
    ret##m##n.save(filter_transform_buf + (m * alpha + n) * OC * IC +   \
                   ocb * IC * pack_size + icb * pack_size * pack_size + \
                   ic_inner * pack_size);
                UNROLL_CALL_NOWRAPPER_D2(8, 8, cb_save)
#undef cb_save
            }
        }
    }
}
void winograd_F63_mk4_f_nchw44::input(const float* input, | |||
float* input_transform_buf, | |||
float* transform_mid_buf, size_t IH, | |||
size_t IW, size_t IC, size_t PH, | |||
size_t PW, size_t unit_start_idx, | |||
size_t nr_units_in_tile) { | |||
constexpr size_t pack_size = 4; | |||
megdnn_assert(IC % pack_size == 0); | |||
constexpr int alpha = 3 + 6 - 1; | |||
// OW = IW + 2 * PW - KERNEL_SIZE + 1 | |||
auto units_w = | |||
div_ceil<size_t>(IW + 2 * PW - KERNEL_SIZE + 1, OUTPUT_BLOCK_SIZE); | |||
float* patch = transform_mid_buf; | |||
float* patchT = transform_mid_buf + pack_size * alpha * alpha; | |||
for (size_t ic = 0; ic < IC; ic += pack_size) { | |||
rep(unit_idx, nr_units_in_tile) { | |||
size_t index = unit_start_idx + unit_idx; | |||
size_t nh = index / units_w; | |||
size_t nw = index % units_w; | |||
int ih_start = nh * OUTPUT_BLOCK_SIZE - PH; | |||
int iw_start = nw * OUTPUT_BLOCK_SIZE - PW; | |||
if (ih_start >= 0 && ih_start + alpha <= static_cast<int>(IH) && | |||
iw_start >= 0 && iw_start + alpha <= static_cast<int>(IW)) { | |||
InputTransformF63_NCHW44::prepare<true>(input, patch, patchT, | |||
ih_start, iw_start, IH, | |||
IW, ic, IC); | |||
InputTransformF63_NCHW44::transform(patchT, input_transform_buf, | |||
unit_idx, nr_units_in_tile, | |||
ic, IC); | |||
} else { | |||
InputTransformF63_NCHW44::prepare<false>(input, patch, patchT, | |||
ih_start, iw_start, IH, | |||
IW, ic, IC); | |||
InputTransformF63_NCHW44::transform(patchT, input_transform_buf, | |||
unit_idx, nr_units_in_tile, | |||
ic, IC); | |||
} | |||
} | |||
} | |||
} | |||
//! Output transform for F(6x3) Winograd in NCHW44 layout: convert the
//! matmul result back to the spatial domain, apply bias / activation
//! (selected at compile time via DISPATCH_CONV_WINOGRAD_BIAS) and write
//! 6x6 output blocks, clipped to OH x OW at the border.
//!
//! \param output_transform_buf matmul result in the Winograd domain
//! \param bias bias data; interpretation depends on \p bmode
//! \param output destination feature map in NCHW44 layout
//! \param transform_mid_buf scratch buffer for the per-tile transform
//! \param bmode, nonline_mode bias mode and activation mode
//! \param OH, OW output geometry
//! \param oc_start, oc_end output-channel range handled by this call
//! \param unit_start_idx, nr_units_in_tile tile range handled by this call
void winograd_F63_mk4_f_nchw44::output(const float* output_transform_buf,
                                       const float* bias, float* output,
                                       float* transform_mid_buf, BiasMode bmode,
                                       NonlineMode nonline_mode, size_t OH,
                                       size_t OW, size_t oc_start,
                                       size_t oc_end, size_t unit_start_idx,
                                       size_t nr_units_in_tile) {
    constexpr size_t pack_size = 4;
//! instantiated once per (bias mode, activation) pair by the dispatch macro
#define cb(_bmode, _nonline_op, ...)                                       \
    OutputTransformF63_NCHW44<_bmode MEGDNN_COMMA _nonline_op>::transform( \
            __VA_ARGS__);

    auto units_w = div_ceil<size_t>(OW, OUTPUT_BLOCK_SIZE);

    for (size_t oc = oc_start; oc < oc_end; oc += pack_size) {
        //! channel offset relative to the range this call owns
        size_t oc_index = oc - oc_start;
        rep(unit_idx, nr_units_in_tile) {
            size_t index = unit_start_idx + unit_idx;
            auto nh = index / units_w;
            auto nw = index % units_w;
            size_t oh_start = nh * OUTPUT_BLOCK_SIZE;
            size_t ow_start = nw * OUTPUT_BLOCK_SIZE;
            DISPATCH_CONV_WINOGRAD_BIAS(
                    megdnn_arm_common_winograd_fp32_F63_mk4, cb, float, float,
                    bmode, nonline_mode, output_transform_buf, bias, output,
                    transform_mid_buf, oh_start, ow_start, OH, OW, oc_start,
                    oc_end, oc_index, unit_idx, nr_units_in_tile, src_dtype,
                    dst_dtype);
        }
    }
#undef cb
}
} // namespace winograd | |||
} // namespace arm_common | |||
} // namespace megdnn | |||
// vim: syntax=cpp.doxygen |
@@ -169,6 +169,14 @@ public: | |||
static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo), | |||
tile_size)); | |||
winograd_algos.emplace_back(refhold.back().get()); | |||
refhold.emplace_back(new AlgoFP32WinogradF23_4x4_NCHW44( | |||
static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo), | |||
tile_size)); | |||
winograd_algos.emplace_back(refhold.back().get()); | |||
refhold.emplace_back(new AlgoFP32WinogradF63_4x4_NCHW44( | |||
static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo), | |||
tile_size)); | |||
winograd_algos.emplace_back(refhold.back().get()); | |||
#if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC | |||
refhold.emplace_back(new AlgoFP16WinogradF23( | |||
static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo), | |||
@@ -49,6 +49,9 @@ private: | |||
class AlgoFP32WinogradF54; | |||
class AlgoFP32WinogradF45; | |||
class AlgoFP32WinogradF23_4x4_NCHW44; | |||
class AlgoFP32WinogradF63_4x4_NCHW44; | |||
class AlgoS8ChanWiseStride1NCHW44; | |||
class AlgoS8ChanWiseStride2NCHW44; | |||
@@ -28,13 +28,22 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src, | |||
using namespace winograd; | |||
check_exec(src.layout, dst.layout, workspace.size); | |||
//! NCHW44 group conv or NCHW group conv or both dense conv | |||
size_t flt_start = 0; | |||
size_t pack_c_size = 1; | |||
size_t group = 1; | |||
if (src.layout.ndim == 5) { | |||
if (src.layout.ndim == 5) { //! {g, OC, IC, FH, FW} | |||
flt_start = 1; | |||
group = src.layout[0]; | |||
} else if (src.layout.ndim == 6) { //! {OC/4, IC/4, FH, FW, 4, 4} | |||
pack_c_size = src.layout[5]; | |||
} else if (src.layout.ndim == 7) { //! {g, OC/4, IC/4, FH, FW, 4, 4} | |||
flt_start = 1; | |||
group = src.layout[0]; | |||
pack_c_size = src.layout[6]; | |||
} | |||
size_t OC = src.layout[flt_start], IC = src.layout[flt_start + 1], | |||
size_t OC = src.layout[flt_start] * pack_c_size, | |||
IC = src.layout[flt_start + 1] * pack_c_size, | |||
FW = src.layout[flt_start + 3]; | |||
size_t m = param().output_block_size; | |||
@@ -68,13 +77,23 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src, | |||
float* workspace_ptr = workspace.ptr<float>(); | |||
if (FW == 3) { | |||
if (m == 2) { | |||
DISPATCH(winograd_2x3_4x4_f, param::Winograd::Format::MK4, 0, | |||
0); | |||
if (pack_c_size == 1) { | |||
DISPATCH(winograd_2x3_4x4_f, param::Winograd::Format::MK4, | |||
0, 0); | |||
} else if (pack_c_size == 4) { | |||
DISPATCH(winograd_F23_mk4_f_nchw44, | |||
param::Winograd::Format::MK4, 0, 5); | |||
} | |||
} else if (m == 6) { | |||
DISPATCH(winograd_6x3_1x1_f, param::Winograd::Format::DEFAULT, | |||
0, 1); | |||
DISPATCH(winograd_6x3_4x4_f, param::Winograd::Format::MK4, 0, | |||
2); | |||
if (pack_c_size == 1) { | |||
DISPATCH(winograd_6x3_4x4_f, param::Winograd::Format::MK4, | |||
0, 2); | |||
} else if (pack_c_size == 4) { | |||
DISPATCH(winograd_F63_mk4_f_nchw44, | |||
param::Winograd::Format::MK4, 0, 6); | |||
} | |||
} | |||
} else if (FW == 4) { | |||
if (m == 5) { | |||
@@ -158,7 +158,10 @@ ConvBiasForward::CanonizedFilterMeta ConvBiasForward::check_exec( | |||
} | |||
template <typename T> | |||
struct ParamTrait; | |||
struct NCHWParamTrait; | |||
template <typename T> | |||
struct NCHW44ParamTrait; | |||
std::string ConvBias::WinogradParam::to_string() const { | |||
return ssprintf("%u:%u:%u", channel_block_size, output_block_size, | |||
@@ -166,32 +169,51 @@ std::string ConvBias::WinogradParam::to_string() const { | |||
} | |||
template <typename T> | |||
std::string ConvBias::algo_name(const std::string& base, const T& p) { | |||
return ssprintf("%s:%s:%s", ParamTrait<T>::category.c_str(), base.c_str(), | |||
p.to_string().c_str()); | |||
std::string ConvBias::algo_name(const std::string& base, const T& p, | |||
param::ConvBias::Format format) { | |||
if (format == param::ConvBias::Format::NCHW) { | |||
return ssprintf("%s:%s:%s", NCHWParamTrait<T>::category.c_str(), | |||
base.c_str(), p.to_string().c_str()); | |||
} else if (format == param::ConvBias::Format::NCHW44) { | |||
return ssprintf("%s:%s:%s", NCHW44ParamTrait<T>::category.c_str(), | |||
base.c_str(), p.to_string().c_str()); | |||
} | |||
megdnn_throw("Invalid format"); | |||
return ""; | |||
} | |||
#define FOREACH_CONV_BIAS_PARAM(cb) \ | |||
cb(WinogradParam) cb(DirectParam) cb(MatmulParam) cb(DefaultParam) | |||
#define cb(pt) \ | |||
template <> \ | |||
struct ParamTrait<ConvBias::pt> { \ | |||
static const std::string category; \ | |||
#define cb(pt) \ | |||
template <> \ | |||
struct NCHWParamTrait<ConvBias::pt> { \ | |||
static const std::string category; \ | |||
}; \ | |||
template <> \ | |||
struct NCHW44ParamTrait<ConvBias::pt> { \ | |||
static const std::string category; \ | |||
}; | |||
FOREACH_CONV_BIAS_PARAM(cb) | |||
#undef cb | |||
#define cb(pt, ct) const std::string ParamTrait<ConvBias::pt>::category = ct | |||
cb(WinogradParam, "WINOGRAD"); | |||
#define cb(pt, ct) \ | |||
const std::string NCHWParamTrait<ConvBias::pt>::category = ct; \ | |||
const std::string NCHW44ParamTrait<ConvBias::pt>::category = ct | |||
cb(DirectParam, "DIRECT"); | |||
cb(MatmulParam, "MATMUL"); | |||
cb(DefaultParam, "DEFAULT"); | |||
#undef cb | |||
const std::string NCHWParamTrait<ConvBias::WinogradParam>::category = | |||
"WINOGRAD"; | |||
const std::string NCHW44ParamTrait<ConvBias::WinogradParam>::category = | |||
"WINOGRAD_NCHW44"; | |||
#define cb(t) \ | |||
template std::string ConvBias::algo_name<ConvBias::t>( \ | |||
const std::string& base, const ConvBias::t& p); | |||
const std::string& base, const ConvBias::t& p, \ | |||
param::ConvBias::Format format); | |||
FOREACH_CONV_BIAS_PARAM(cb) | |||
#undef cb | |||
@@ -199,17 +221,37 @@ ConvBias::WinogradParam ConvBias::parse_winograd_name( | |||
const std::string& algo_name) { | |||
ConvBias::WinogradParam ret = INVALID_WINOGRAD_PARAM; | |||
char base[128]; | |||
sscanf(algo_name.c_str(), "WINOGRAD:%[^:]:%u:%u:%u", base, | |||
&(ret.channel_block_size), &(ret.output_block_size), | |||
&(ret.tile_size)); | |||
if (ret.tile_size == 0 || ret.output_block_size == 0 || | |||
ret.channel_block_size == 0) { | |||
megdnn_log_warn("the algo name %s is not suitable for winograd", | |||
algo_name.c_str()); | |||
return INVALID_WINOGRAD_PARAM; | |||
char name[128]; | |||
auto parse = [&](const std::string& algo_name, | |||
const std::string& pre) -> auto { | |||
memset(name, 0, 128); | |||
sscanf(algo_name.c_str(), "%[^:]:%[^:]:%u:%u:%u", name, base, | |||
&(ret.channel_block_size), &(ret.output_block_size), | |||
&(ret.tile_size)); | |||
if (strcmp(name, pre.c_str())) { | |||
megdnn_log_warn("algo %s is not %s algo", name, pre.c_str()); | |||
ret = INVALID_WINOGRAD_PARAM; | |||
return false; | |||
} | |||
if (ret.tile_size == 0 || ret.output_block_size == 0 || | |||
ret.channel_block_size == 0) { | |||
megdnn_log_warn("the algo name %s is not suitable for %s", | |||
algo_name.c_str(), pre.c_str()); | |||
ret = INVALID_WINOGRAD_PARAM; | |||
return false; | |||
} | |||
return true; | |||
}; | |||
if (parse(algo_name, "WINOGRAD_NCHW44")) { | |||
return ret; | |||
} else { | |||
parse(algo_name, "WINOGRAD"); | |||
return ret; | |||
} | |||
return ret; | |||
} | |||
constexpr ConvBias::WinogradParam ConvBias::INVALID_WINOGRAD_PARAM; | |||
void handle_bias_and_nonlinear(Handle* handle, param::ConvBias args, | |||
@@ -299,6 +299,7 @@ void make_canonized_filter_meta_nchwxx( | |||
megdnn_assert(param.format == Param::Format::NCHW88 || | |||
param.format == Param::Format::NCHW44 || | |||
param.format == Param::Format::NCHW44_WINOGRAD || | |||
param.format == Param::Format::NCHW88_WINOGRAD); | |||
size_t img_ndim = 2; | |||
size_t flt_start = 0; | |||
@@ -663,6 +664,7 @@ ConvolutionBase<Parameter>::deduce_layout_fwd(const TensorLayout& src, | |||
param().format == Param::Format::NCHW32 || | |||
param().format == Param::Format::NCHW88 || | |||
param().format == Param::Format::NCHW88_WINOGRAD || | |||
param().format == Param::Format::NCHW44_WINOGRAD || | |||
param().format == Param::Format::CHWN4); | |||
img_dim = src.ndim - 3; | |||
if ((param().format == Param::Format::NCHW88 || | |||
@@ -68,6 +68,7 @@ constexpr size_t layout_pack_size(param::ConvBias::Format layout) { | |||
case param::ConvBias::Format::NHWCD4: | |||
return 4; | |||
case param::ConvBias::Format::NCHW4: | |||
case param::ConvBias::Format::NCHW44: | |||
return 4; | |||
case param::ConvBias::Format::NCHW32: | |||
return 32; | |||
@@ -363,6 +364,7 @@ INST(uint8_t, uint8_t, int16_t, int) | |||
_ctype, _dst_type, _input_filter_compute_type, \ | |||
_output_compute_type, layout, param::MatrixMul::Format::MK4>; | |||
INST(float, float, float, float, param::ConvBias::Format::NCHW) | |||
INST(float, float, float, float, param::ConvBias::Format::NCHW44) | |||
#undef INST | |||
#define INST(_ctype, _dst_type, _input_filter_compute_type, \ | |||
@@ -425,6 +425,7 @@ const T* ConvBiasImpl::NCBKernParam::filter(size_t group_pack_id, | |||
break; | |||
} | |||
case ConvBiasImpl::Param::Format::NCHW_WINOGRAD: | |||
case ConvBiasImpl::Param::Format::NCHW44_WINOGRAD: | |||
case ConvBiasImpl::Param::Format::NCHW88_WINOGRAD: { | |||
//! four format of weight layout | |||
//! 1. {g, alpha, alpha, ocpg/8, icpg/8, 8, 8} | |||
@@ -198,7 +198,8 @@ void ConvolutionImpl::exec_with_ncb_kern(const NCBKernParam& param, | |||
for (auto kernel : kerns) { | |||
megdnn_assert(param.filter_meta.format == Param::Format::NCHW || | |||
param.filter_meta.format == Param::Format::NHWC || | |||
param.filter_meta.format == Param::Format::NCHW88, | |||
param.filter_meta.format == Param::Format::NCHW88 || | |||
param.filter_meta.format == Param::Format::NCHW44, | |||
"invalid conv format"); | |||
auto run = [param, kernel](size_t index, size_t thread_id) { | |||
CpuNDRange ndrange_id(kernel.global_size, index); | |||
@@ -137,6 +137,7 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src, | |||
#undef DISPATCH_FORMAT_MK8 | |||
#undef DISPATCH_DTYPE | |||
} else { | |||
megdnn_assert(src.layout.ndim == 6 || src.layout.ndim == 7); | |||
#define cb(_ctype, _dst_type, _input_filter_compute_type, \ | |||
_output_compute_type, _format, rescale) \ | |||
if (param().format == _format) { \ | |||
@@ -158,20 +159,58 @@ void WinogradFilterPreprocessImpl::exec(_megdnn_tensor_in src, | |||
DISPATCH_KERNEL(dt_float32, dt_float32, dt_float32, dt_float32, \ | |||
DISPATCH_FORMAT_MK8, 1.0f, _midout_tag, 0); \ | |||
} | |||
megdnn_assert(src.layout.ndim == 6 || src.layout.ndim == 7); | |||
if (FW == 3) { | |||
if (m == 2) { | |||
std::vector<float> interp_points = {0, 1, -1}; | |||
DISPATCH_DTYPE(4); | |||
} else if (m == 6) { | |||
std::vector<float> interp_points = {0, 1, -1, 2, -2, 0.5, -0.5}; | |||
DISPATCH_DTYPE(5); | |||
if (pack_c_size == 8) { //! NCHW88 | |||
if (FW == 3) { | |||
if (m == 2) { | |||
std::vector<float> interp_points = {0, 1, -1}; | |||
DISPATCH_DTYPE(4); | |||
} else if (m == 6) { | |||
std::vector<float> interp_points = {0, 1, -1, 2, | |||
-2, 0.5, -0.5}; | |||
DISPATCH_DTYPE(5); | |||
} | |||
} | |||
#undef cb | |||
#undef DISPATCH_FORMAT_MK8 | |||
#undef DISPATCH_DTYPE | |||
} | |||
else if (pack_c_size == 4) { //! NCHW44 | |||
#define cb(_ctype, _dst_type, _input_filter_compute_type, \ | |||
_output_compute_type, _format, rescale) \ | |||
if (param().format == _format) { \ | |||
return winograd::StrategyHelper< \ | |||
_ctype, _dst_type, _input_filter_compute_type, \ | |||
_output_compute_type, param::ConvBias::Format::NCHW44, \ | |||
_format>::filter(src_ptr, dst_ptr, workspace_ptr, OC, IC, 0, \ | |||
OC, m, FW, interp_points, src.layout.dtype, \ | |||
rescale); \ | |||
} | |||
#define DISPATCH_FORMAT_MK4(_ctype, _dst_type, _input_filter_compute_type, \ | |||
_output_compute_type, _rescale) \ | |||
cb(_ctype, _dst_type, _input_filter_compute_type, _output_compute_type, \ | |||
param::Winograd::Format::MK4, _rescale); | |||
#define DISPATCH_DTYPE(_midout_tag) \ | |||
if (src.layout.dtype.enumv() == DTypeEnum::Float32) { \ | |||
DISPATCH_KERNEL(dt_float32, dt_float32, dt_float32, dt_float32, \ | |||
DISPATCH_FORMAT_MK4, 1.0f, _midout_tag, 0); \ | |||
} | |||
if (FW == 3) { | |||
if (m == 2) { | |||
std::vector<float> interp_points = {0, 1, -1}; | |||
DISPATCH_DTYPE(6); | |||
} else if (m == 6) { | |||
std::vector<float> interp_points = {0, 1, -1, 2, | |||
-2, 0.5, -0.5}; | |||
DISPATCH_DTYPE(7); | |||
} | |||
} | |||
#undef cb | |||
#undef DISPATCH_FORMAT_MK8 | |||
#undef DISPATCH_KERNEL | |||
#undef DISPATCH_DTYPE | |||
} | |||
} | |||
megdnn_assert(execed, | |||
"Unsupport winograd filter preprocess. m: %zu src: %s", m, | |||
@@ -699,6 +699,135 @@ TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F16_F23_8x8) { | |||
} | |||
#endif | |||
void benchmark_winograd_nchw_vs_nchw44(const char* algo_name, Handle* handle) { | |||
using namespace conv_bias; | |||
using NLMode = param::ConvBias::NonlineMode; | |||
std::vector<conv_bias::TestArg> args_nchw44; | |||
std::vector<conv_bias::TestArg> args_nchw; | |||
auto pack = [&](size_t n, size_t oc, size_t ic, size_t h, size_t w, | |||
size_t group, NLMode nlmode) { | |||
param::ConvBias param; | |||
param.format = param::ConvBias::Format::NCHW44; | |||
param.stride_h = 1; | |||
param.stride_w = 1; | |||
param.pad_h = 1; | |||
param.pad_w = 1; | |||
param.nonlineMode = nlmode; | |||
if (group == 1) { | |||
param.sparse = param::ConvBias::Sparse::DENSE; | |||
args_nchw44.emplace_back(param, TensorShape{n, ic / 4, h, w, 4}, | |||
TensorShape{oc / 4, ic / 4, 3, 3, 4, 4}, | |||
TensorShape{}); | |||
param.format = param::ConvBias::Format::NCHW; | |||
args_nchw.emplace_back(param, TensorShape{n, ic, h, w}, | |||
TensorShape{oc, ic, 3, 3}, TensorShape{}); | |||
} else { | |||
auto oc_per_group = oc / group; | |||
auto ic_per_group = ic / group; | |||
param.sparse = param::ConvBias::Sparse::GROUP; | |||
args_nchw44.emplace_back(param, | |||
TensorShape{n, ic_per_group / 4, h, w, 4}, | |||
TensorShape{group, oc_per_group / 4, | |||
ic_per_group / 4, 3, 3, 4, 4}, | |||
TensorShape{}); | |||
param.format = param::ConvBias::Format::NCHW; | |||
args_nchw.emplace_back( | |||
param, TensorShape{n, ic, h, w}, | |||
TensorShape{group, oc_per_group, ic_per_group, 3, 3}, | |||
TensorShape{}); | |||
} | |||
}; | |||
std::vector<NLMode> nonlinemode = {NLMode::IDENTITY}; | |||
for (auto nlmode : nonlinemode) | |||
for (size_t n : {1, 2}) | |||
for (size_t group = 1; group <= 2; ++group) { | |||
pack(n, 512, 512, 15, 15, group, nlmode); | |||
pack(n, 512, 256, 15, 15, group, nlmode); | |||
pack(n, 256, 256, 29, 29, group, nlmode); | |||
pack(n, 256, 128, 29, 29, group, nlmode); | |||
pack(n, 128, 128, 57, 57, group, nlmode); | |||
pack(n, 128, 64, 57, 57, group, nlmode); | |||
pack(n, 24, 24, 224, 224, group, nlmode); | |||
pack(n, 64, 24, 123, 123, group, nlmode); | |||
pack(n, 64, 64, 56, 56, group, nlmode); | |||
pack(n, 128, 128, 28, 28, group, nlmode); | |||
pack(n, 256, 256, 14, 14, group, nlmode); | |||
pack(n, 512, 512, 7, 7, group, nlmode); | |||
} | |||
using namespace conv_bias; | |||
constexpr size_t RUN = 10; | |||
Benchmarker<ConvBias> benchmark_winograd_nchw(handle); | |||
benchmark_winograd_nchw.set_display(false); | |||
benchmark_winograd_nchw.set_times(RUN); | |||
Benchmarker<ConvBias> benchmark_winograd_nchw44(handle); | |||
benchmark_winograd_nchw44.set_display(false); | |||
benchmark_winograd_nchw44.set_times(RUN); | |||
std::string winograd_nchw_algo_name = ssprintf("WINOGRAD:%s", algo_name); | |||
std::string winograd_nchw44_algo_name = | |||
ssprintf("WINOGRAD_NCHW44:%s", algo_name); | |||
for (size_t i = 0; i < args_nchw.size(); ++i) { | |||
auto arg_nchw = args_nchw[i]; | |||
auto arg_nchw44 = args_nchw44[i]; | |||
TensorLayout dst_layout; | |||
auto opr = handle->create_operator<ConvBias>(); | |||
opr->param() = arg_nchw.param; | |||
opr->deduce_layout({arg_nchw.src, dtype::Float32()}, | |||
{arg_nchw.filter, dtype::Float32()}, | |||
{arg_nchw.bias, dtype::Float32()}, {}, dst_layout); | |||
//! dst.nr_elems * IC * FH * FW * 2 | |||
float computations = dst_layout.total_nr_elems() * arg_nchw.filter[1] * | |||
arg_nchw.filter[2] * arg_nchw.filter[3] * 2.0 / | |||
(1024 * 1024 * 1024) * 1e3; | |||
benchmark_winograd_nchw.set_param(arg_nchw.param); | |||
auto nchw_used = algo_benchmark<ConvBias>( | |||
benchmark_winograd_nchw, | |||
{arg_nchw.src, arg_nchw.filter, {}, {}, {}}, | |||
winograd_nchw_algo_name.c_str()) / | |||
RUN; | |||
benchmark_winograd_nchw44.set_param(arg_nchw44.param); | |||
auto nchw44_used = | |||
algo_benchmark<ConvBias>( | |||
benchmark_winograd_nchw44, | |||
{arg_nchw44.src, arg_nchw44.filter, {}, {}, {}}, | |||
winograd_nchw44_algo_name.c_str()) / | |||
RUN; | |||
printf("%s %s: nchw: %f ms %f Gflops nchw44: %f ms %f GFlops " | |||
"speedup: " | |||
"%f\n", | |||
arg_nchw.src.to_string().c_str(), | |||
arg_nchw.filter.to_string().c_str(), nchw_used, | |||
computations / nchw_used, nchw44_used, | |||
computations / nchw44_used, nchw_used / nchw44_used); | |||
} | |||
} | |||
//! Benchmark NCHW vs NCHW44 fp32 winograd F(2x2, 3x3) with the MK4 matmul
//! ("...:4:2" = channel block 4, output block 2).
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23_MK4_NCHW_VS_NCHW44) {
#if MEGDNN_AARCH64
    benchmark_winograd_nchw_vs_nchw44("AARCH64_F32_MK4_4x16:4:2", handle());
#else
    benchmark_winograd_nchw_vs_nchw44("ARMV7_F32_MK4_4x8:4:2", handle());
#endif
}
//! Benchmark NCHW vs NCHW44 fp32 winograd F(6x6, 3x3) with the MK4 matmul
//! ("...:4:6" = channel block 4, output block 6).
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F63_MK4_NCHW_VS_NCHW44) {
#if MEGDNN_AARCH64
    benchmark_winograd_nchw_vs_nchw44("AARCH64_F32_MK4_4x16:4:6", handle());
#else
    benchmark_winograd_nchw_vs_nchw44("ARMV7_F32_MK4_4x8:4:6", handle());
#endif
}
TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F23_8x8) { | |||
auto benchmark_winograd_quantized = [](const char* algo_name_fp32, | |||
const char* algo_name_quantized, | |||
@@ -654,6 +654,15 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F23_4) { | |||
check_winograd("4:2:32", checker, args, param::MatrixMul::Format::MK4); | |||
} | |||
//! Correctness of fp32 winograd F(2x2, 3x3) mk4 on NCHW44 layout;
//! "4:2:32" = channel_block_size:output_block_size:tile_size.
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F23_4_NCHW44) {
    using namespace conv_bias;
    std::vector<TestArg> args = get_nchw44_conv_bias_args({3}, 1);
    Checker<ConvBiasForward> checker(handle());
    check_winograd("4:2:32", checker, args, param::MatrixMul::Format::MK4,
                   param::ConvBias::Format::NCHW44);
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F63) { | |||
using namespace conv_bias; | |||
std::vector<TestArg> args = get_winograd_args(3); | |||
@@ -667,7 +676,15 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F63_4) { | |||
std::vector<TestArg> args = get_winograd_mk_packed_args(); | |||
Checker<ConvBiasForward> checker(handle()); | |||
check_winograd("4:6:32", checker, args, param::MatrixMul::Format::MK4); | |||
check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4); | |||
} | |||
//! Correctness of fp32 winograd F(6x6, 3x3) mk4 on NCHW44 layout;
//! "4:6:16" = channel_block_size:output_block_size:tile_size.
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F63_4_NCHW44) {
    using namespace conv_bias;
    std::vector<TestArg> args = get_nchw44_conv_bias_args({3}, 1);
    Checker<ConvBiasForward> checker(handle());
    check_winograd("4:6:16", checker, args, param::MatrixMul::Format::MK4,
                   param::ConvBias::Format::NCHW44);
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_F54) { | |||
@@ -761,6 +778,75 @@ TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD) { | |||
#endif | |||
} | |||
//! Check that an explicit WinogradFilterPreprocess followed by a ConvBias
//! run in NCHW44_WINOGRAD format matches the reference NCHW44 convolution:
//! the checker compares the fused path against the extra_impl below.
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_PREPROCESS_NCHW44) {
    using namespace conv_bias;
    std::vector<TestArg> nchw44_args = get_nchw44_conv_bias_args({3}, 1);
    Checker<ConvBiasForward> checker(handle());

    //! reference impl: transform the filter with output_block_size = m,
    //! then run conv_bias on the pre-transformed filter
    auto extra_impl = [](const TensorNDArray& tensors, uint32_t m,
                         param::ConvBias param, Handle* handle) {
        megdnn_assert(param.format == param::ConvBias::Format::NCHW44);
        auto winograd_preprocess_opr =
                handle->create_operator<WinogradFilterPreprocess>();
        winograd_preprocess_opr->param().output_block_size = m;
        winograd_preprocess_opr->param().format = param::MatrixMul::Format::MK4;
        TensorLayout filter_transform_layout;
        winograd_preprocess_opr->deduce_layout(tensors[1].layout,
                                               filter_transform_layout);
        size_t winograd_preprocess_workspace_in_bytes =
                winograd_preprocess_opr->get_workspace_in_bytes(
                        tensors[1].layout, filter_transform_layout);

        auto conv_bias_opr = handle->create_operator<ConvBias>();
        conv_bias_opr->param() = param;
        conv_bias_opr->param().format = param::ConvBias::Format::NCHW44_WINOGRAD;
        conv_bias_opr->param().output_block_size = m;
        size_t conv_bias_workspace_in_bytes =
                conv_bias_opr->get_workspace_in_bytes(
                        tensors[0].layout, filter_transform_layout,
                        tensors[2].layout, tensors[3].layout,
                        tensors[4].layout, nullptr);

        //! bundle slots: 0 = transformed filter, 1 = conv workspace,
        //! 2 = preprocess workspace
        WorkspaceBundle wb(nullptr, {filter_transform_layout.span().dist_byte(),
                                     conv_bias_workspace_in_bytes,
                                     winograd_preprocess_workspace_in_bytes});
        wb.set(malloc(wb.total_size_in_bytes()));

        TensorND filter_transform_tensor(wb.get(0),
                                         std::move(filter_transform_layout));
        winograd_preprocess_opr->exec(tensors[1], filter_transform_tensor,
                                      wb.get_workspace(2));
        conv_bias_opr->exec(tensors[0], filter_transform_tensor, tensors[2],
                            tensors[3], tensors[4], nullptr,
                            wb.get_workspace(1));
        free(wb.ptr());
    };

    //! run every arg at every output block size with fp32 everywhere
    auto run = [&checker, &extra_impl](
                       Handle* handle, const std::vector<TestArg>& args,
                       const std::vector<size_t>& out_size, DType A_dtype,
                       DType B_dtype, DType C_dtype, DType D_dtype,
                       const float eps) {
        for (auto&& arg : args) {
            for (uint32_t m : out_size) {
                checker.set_extra_opr_impl(std::bind(extra_impl,
                                                     std::placeholders::_1, m,
                                                     arg.param, handle));
                checker.set_dtype(0, A_dtype)
                        .set_dtype(1, B_dtype)
                        .set_dtype(2, C_dtype)
                        .set_dtype(4, D_dtype)
                        .set_epsilon(eps)
                        .set_param(arg.param)
                        .execs({arg.src, arg.filter, arg.bias, {}, {}});
            }
        }
    };
    run(handle(), nchw44_args, {2, 6}, dtype::Float32(), dtype::Float32(),
        dtype::Float32(), dtype::Float32(), 1e-3f);
}
TEST_F(ARM_COMMON_MULTI_THREADS, CONV_BIAS_WINOGRAD_MK_PACKED_F32_1) { | |||
using namespace conv_bias; | |||
@@ -72,7 +72,7 @@ void WinogradTransformReplacePass::apply(OptState& opt) const { | |||
auto&& inputs = conv_bias_opr.input(); | |||
VarNodeArray new_inp; | |||
new_inp.reserve(inputs.size()); | |||
for (auto i: inputs) { | |||
for (auto i : inputs) { | |||
new_inp.push_back(rewriter.get_var(i)); | |||
} | |||
@@ -86,11 +86,15 @@ void WinogradTransformReplacePass::apply(OptState& opt) const { | |||
megdnn::ConvBias::parse_winograd_name(algo_name); | |||
if (winograd_param == megdnn::ConvBias::INVALID_WINOGRAD_PARAM) | |||
break; | |||
mgb_assert(conv_bias_opr.param().format == | |||
megdnn::ConvBias::Param::Format::NCHW || | |||
conv_bias_opr.param().format == | |||
megdnn::ConvBias::Param::Format::NCHW88, | |||
"currently winograd only suppport NCHW and nchw88"); | |||
mgb_assert( | |||
conv_bias_opr.param().format == | |||
megdnn::ConvBias::Param::Format::NCHW || | |||
conv_bias_opr.param().format == | |||
megdnn::ConvBias::Param::Format::NCHW88 || | |||
conv_bias_opr.param().format == | |||
megdnn::ConvBias::Param::Format::NCHW44, | |||
"currently winograd only suppport NCHW and NCHW44 and " | |||
"NCHW88"); | |||
opr::ConvBiasForward::check_winograd_param_valid( | |||
winograd_param, conv_bias_opr.input(0)->dtype()); | |||
megdnn::param::Winograd winograd_preprocess_param; | |||
@@ -110,8 +114,17 @@ void WinogradTransformReplacePass::apply(OptState& opt) const { | |||
megdnn::ConvBias::Param::Format::NCHW_WINOGRAD; | |||
} else { | |||
mgb_assert(new_inp[0]->shape().ndim == 5); | |||
conv_bias_param.format = | |||
megdnn::ConvBias::Param::Format::NCHW88_WINOGRAD; | |||
size_t pack_size = new_inp[0]->shape()[4]; | |||
if (pack_size == 8) { | |||
conv_bias_param.format = | |||
megdnn::ConvBias::Param::Format::NCHW88_WINOGRAD; | |||
} else if (pack_size == 4) { | |||
conv_bias_param.format = | |||
megdnn::ConvBias::Param::Format::NCHW44_WINOGRAD; | |||
} else { | |||
mgb_assert(0, "Invalid pack size %zu in algo %s", pack_size, | |||
algo_name.c_str()); | |||
} | |||
} | |||
conv_bias_param.output_block_size = | |||
winograd_param.output_block_size; | |||