GitOrigin-RevId: a981b2f61b
release-1.11.1
@@ -177,6 +177,15 @@
     UNROLL_RAW_5x2(cb, v0, ##a) \
     cb(5, 0, ##a) cb(5, 1, ##a)
+#define UNROLL_RAW_4x6(cb, v0, a...) \
+    cb(0, 0, ##a) cb(0, 1, ##a) cb(0, 2, ##a) cb(0, 3, ##a) cb(0, 4, ##a) cb(0, 5, ##a) \
+    cb(1, 0, ##a) cb(1, 1, ##a) cb(1, 2, ##a) cb(1, 3, ##a) cb(1, 4, ##a) cb(1, 5, ##a) \
+    cb(2, 0, ##a) cb(2, 1, ##a) cb(2, 2, ##a) cb(2, 3, ##a) cb(2, 4, ##a) cb(2, 5, ##a) \
+    cb(3, 0, ##a) cb(3, 1, ##a) cb(3, 2, ##a) cb(3, 3, ##a) cb(3, 4, ##a) cb(3, 5, ##a)
+#define UNROLL_RAW_5x6(cb, v0, a...) \
+    UNROLL_RAW_4x6(cb, v0, ##a) \
+    cb(4, 0, ##a) cb(4, 1, ##a) cb(4, 2, ##a) cb(4, 3, ##a) cb(4, 4, ##a) cb(4, 5, ##a)
 #define UNROLL_CALL0_D2(step, step2, cb, v...) \
     UNROLL_RAW_##step##x##step2(cb, 0, ##v)
 #define UNROLL_CALL1_D2(step, step2, cb, v...) \
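
The new 4x6 and 5x6 helpers follow the existing UNROLL_RAW pattern: UNROLL_CALL0_D2 token-pastes the step counts into a macro name, so the callback is expanded once per (row, col) pair at preprocessing time with no runtime loop. A minimal standalone sketch (the macro bodies are copied from this patch; the printf callback is only for illustration, and a.../##a are the GNU variadic-macro extensions this header already uses):

#include <cstdio>

#define UNROLL_RAW_4x6(cb, v0, a...) \
    cb(0, 0, ##a) cb(0, 1, ##a) cb(0, 2, ##a) cb(0, 3, ##a) cb(0, 4, ##a) cb(0, 5, ##a) \
    cb(1, 0, ##a) cb(1, 1, ##a) cb(1, 2, ##a) cb(1, 3, ##a) cb(1, 4, ##a) cb(1, 5, ##a) \
    cb(2, 0, ##a) cb(2, 1, ##a) cb(2, 2, ##a) cb(2, 3, ##a) cb(2, 4, ##a) cb(2, 5, ##a) \
    cb(3, 0, ##a) cb(3, 1, ##a) cb(3, 2, ##a) cb(3, 3, ##a) cb(3, 4, ##a) cb(3, 5, ##a)
#define UNROLL_CALL0_D2(step, step2, cb, v...) \
    UNROLL_RAW_##step##x##step2(cb, 0, ##v)

#define cb(m, n) std::printf("cell (%d, %d)\n", m, n);

int main() {
    UNROLL_CALL0_D2(4, 6, cb);  // expands into 24 printf statements
    return 0;
}
#undef cb

This mirrors how the new strategy file declares its 4x6 and 6x6 tile registers with UNROLL_CALL_NOWRAPPER_D2.
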
@@ -218,6 +218,44 @@ MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(
         AlgoFP32WinogradF63_4x4, winograd::winograd_6x3_4x4_f,
         megdnn_fallback_winograd_fp32, param::MatrixMul::Format::MK4);
+/* ======================= AlgoFP32WinogradF43_4x4 ======================== */
+bool ConvBiasImpl::AlgoFP32WinogradF43_4x4::usable(
+        const NCBKernSizeParam& param,
+        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
+    MEGDNN_MARK_USED_VAR(param);
+    MIDOUT_BEGIN(megdnn_fallback_winograd_fp32, 6, 0) {
+        if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0)
+            return false;
+        using Strategy = winograd::winograd_4x3_4x4_f;
+        using PackMode = fallback::MatrixMulImpl::AlgoBase::PackMode;
+        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
+        auto&& matmul_param =
+                megdnn::winograd::ConvBias<Strategy, param::MatrixMul::Format::MK4>(
+                        strategy, m_tile_size, param)
+                        .get_matmul_kern_param(param);
+        return m_matmul_algo->usable(matmul_param) &&
+               m_matmul_algo->packmode() == PackMode::NO_PACK &&
+               param.filter_meta.format == param::ConvBias::Format::NCHW &&
+               !param.filter_meta.should_flip &&
+               (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
+                param.filter_meta.spatial[0] == 3) &&
+               (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
+                param.filter_meta.stride[0] == 1) &&
+               (param.filter_meta.dilation[0] == param.filter_meta.dilation[1] &&
+                param.filter_meta.dilation[0] == 1) &&
+               param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
+               param.src_type.enumv() == DTypeEnum::Float32 &&
+               param.filter_meta.icpg % 4 == 0 && param.filter_meta.ocpg % 4 == 0;
+    }
+    MIDOUT_END();
+    return false;
+}
+MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(
+        AlgoFP32WinogradF43_4x4, winograd::winograd_4x3_4x4_f,
+        megdnn_fallback_winograd_fp32, param::MatrixMul::Format::MK4);
 /* =================== AlgoFP32WinogradF23_4x4_NCHW44 =================== */
 bool ConvBiasImpl::AlgoFP32WinogradF23_4x4_NCHW44::usable(
@@ -297,6 +335,46 @@ MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(
         AlgoFP32WinogradF63_4x4_NCHW44, winograd::winograd_F63_mk4_f_nchw44,
         megdnn_fallback_winograd_fp32, param::MatrixMul::Format::MK4);
+/* =================== AlgoFP32WinogradF43_4x4_NCHW44 ===================== */
+bool ConvBiasImpl::AlgoFP32WinogradF43_4x4_NCHW44::usable(
+        const NCBKernSizeParam& param,
+        AlgoSelectionStrategy /*algo_selection_strategy*/) const {
+    MEGDNN_MARK_USED_VAR(param);
+    MIDOUT_BEGIN(
+            megdnn_fallback_winograd_fp32,
+            midout_iv("AlgoFP32WinogradF43_4x4_NCHW44"_hash)) {
+        if (param.filter_meta.icpg % 4 != 0 || param.filter_meta.ocpg % 4 != 0)
+            return false;
+        using Strategy = winograd::winograd_F43_mk4_f_nchw44;
+        Strategy strategy(param.src_type, param.filter_type, param.dst_type);
+        auto&& matmul_param =
+                megdnn::winograd::ConvBias<Strategy, param::MatrixMul::Format::MK4>(
+                        strategy, m_tile_size, param)
+                        .get_matmul_kern_param(param);
+        return m_matmul_algo->usable(matmul_param) &&
+               m_matmul_algo->packmode() ==
+                       fallback::MatrixMulImpl::AlgoBase::PackMode::NO_PACK &&
+               param.filter_meta.format == param::ConvBias::Format::NCHW44 &&
+               !param.filter_meta.should_flip &&
+               (param.filter_meta.spatial[0] == param.filter_meta.spatial[1] &&
+                param.filter_meta.spatial[0] == 3) &&
+               (param.filter_meta.stride[0] == param.filter_meta.stride[1] &&
+                param.filter_meta.stride[0] == 1) &&
+               (param.filter_meta.dilation[0] == param.filter_meta.dilation[1] &&
+                param.filter_meta.dilation[0] == 1) &&
+               param.compute_mode == param::ConvBias::ComputeMode::DEFAULT &&
+               param.src_type.enumv() == DTypeEnum::Float32 &&
+               param.filter_meta.icpg % 4 == 0 && param.filter_meta.ocpg % 4 == 0;
+    }
+    MIDOUT_END();
+    return false;
+}
+MEGDNN_WINOGRAD_ALGO_FUN_DEFINE_ALL(
+        AlgoFP32WinogradF43_4x4_NCHW44, winograd::winograd_F43_mk4_f_nchw44,
+        megdnn_fallback_winograd_fp32, param::MatrixMul::Format::MK4);
 /* =================== AlgoFP32WinogradF73_4x4_NCHW44 ===================== */
 bool ConvBiasImpl::AlgoFP32WinogradF73_4x4_NCHW44::usable(
@@ -81,6 +81,23 @@ public:
     MEGDNN_DECL_ALGO_TYPE(GI_COMMON_WINOGRAD_F63_4X4_FP32)
 };
+class ConvBiasImpl::AlgoFP32WinogradF43_4x4 final : public AlgoBase {
+public:
+    AlgoFP32WinogradF43_4x4(
+            fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size)
+            : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {}
+    const char* name() const override {
+        if (m_name.empty()) {
+            m_name = ConvBiasImpl::algo_name<ConvBias::WinogradParam>(
+                    m_matmul_algo->name(), {4, 4, m_tile_size, 3});
+        }
+        return m_name.c_str();
+    }
+    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
+    MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(AlgoDataType::FLOAT32);
+    MEGDNN_DECL_ALGO_TYPE(GI_COMMON_WINOGRAD_F43_4X4_FP32)
+};
 class ConvBiasImpl::AlgoFP32WinogradF54 final : public AlgoBase {
 public:
     AlgoFP32WinogradF54(
@@ -156,6 +173,24 @@ public:
     MEGDNN_DECL_ALGO_TYPE(GI_COMMON_WINOGRAD_F63_4X4_NCHW44_F32)
 };
+class ConvBiasImpl::AlgoFP32WinogradF43_4x4_NCHW44 final : public AlgoBase {
+public:
+    AlgoFP32WinogradF43_4x4_NCHW44(
+            fallback::MatrixMulImpl::AlgoBase* matmul_algo, uint32_t tile_size)
+            : m_matmul_algo{matmul_algo}, m_tile_size{tile_size} {}
+    const char* name() const override {
+        if (m_name.empty()) {
+            m_name = ConvBiasImpl::algo_name<ConvBias::WinogradParam>(
+                    m_matmul_algo->name(), {4, 4, m_tile_size, 3},
+                    param::ConvBias::Format::NCHW44);
+        }
+        return m_name.c_str();
+    }
+    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }
+    MEGDNN_WINOGRAD_ALGO_FUN_DECLARE(AlgoDataType::FLOAT32);
+    MEGDNN_DECL_ALGO_TYPE(GI_COMMON_WINOGRAD_F43_4X4_NCHW44_F32)
+};
 class ConvBiasImpl::AlgoFP32WinogradF73_4x4_NCHW44 final : public AlgoBase {
 public:
     AlgoFP32WinogradF73_4x4_NCHW44(
@@ -16,6 +16,8 @@ MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 4, 3, 1, 1, winograd_4x
 MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 6, 3, 4, 4, winograd_6x3_4x4_f)
+MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 4, 3, 4, 4, winograd_4x3_4x4_f)
 MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 5, 4, 1, 1, winograd_5x4_1x1_f)
 MEGDNN_REG_WINOGRAD_STRATEGY(float, float, float, float, 4, 5, 1, 1, winograd_4x5_1x1_f)
@@ -27,6 +29,9 @@ MEGDNN_REG_WINOGRAD_STRATEGY(
         float, float, float, float, 6, 3, 4, 4, winograd_F63_mk4_f_nchw44)
 MEGDNN_REG_WINOGRAD_STRATEGY(
+        float, float, float, float, 4, 3, 4, 4, winograd_F43_mk4_f_nchw44)
+MEGDNN_REG_WINOGRAD_STRATEGY(
         float, float, float, float, 7, 3, 4, 4, winograd_F73_mk4_f_nchw44)
 }  // namespace winograd
 }  // namespace fallback
@@ -0,0 +1,340 @@
+#include "src/common/unroll_macro.h"
+#include "src/common/utils.h"
+#include "src/common/winograd/winograd_helper.h"
+#include "src/fallback/conv_bias/gi/fp32/filter_transform.h"
+#include "src/fallback/conv_bias/gi/fp32/helper.h"
+#include "src/fallback/conv_bias/gi/fp32/strategy.h"
+#include "src/fallback/conv_bias/winograd/winograd.h"
+#include "src/fallback/elemwise_helper/op_unary.h"
+#include "midout.h"
+MIDOUT_DECL(megdnn_fallback_winograd_fp32_F43_4x4)
+using namespace megdnn;
+using namespace fallback;
+namespace {
+#define MLAF GiMultiplyAddScalarFloat32
+#define MLSF GiMultiplySubScalarFloat32
+struct InputTransform4X3 {
+    /**
+     * @brief Convert the layout from NCHW to NCHW44 (i.e. NC4HW4).
+     *
+     * @tparam inner Whether the whole window [ih_start, ih_start + 6) x
+     * [iw_start, iw_start + 6) lies inside @input
+     * @param input Pointer to the input data (CHW, excluding dim N)
+     * @param patch Buffer of size sizeof(float) * 4 * 6 * 6 holding the data of
+     * the current block contiguously, ordered by C, H, W
+     * @param patchT Output buffer of the same size, with the 4 channels stored
+     * innermost
+     * @param ih_start The start index of dim H of the current block
+     * @param iw_start The start index of dim W of the current block
+     * @param IH Dim H of input
+     * @param IW Dim W of input
+     * @param ic The index of dim C of input
+     * @param IC Dim C of input
+     */
+    template <bool inner>
+    static void transpose(
+            const float* input, float* patch, float* patchT, int ih_start, int iw_start,
+            size_t IH, size_t IW, size_t ic, size_t IC) {
+        constexpr size_t alpha = 4 + 3 - 1;
+        if (!inner || ic + 4 > IC) {
+            memset(patch, 0, sizeof(float) * 4 * alpha * alpha);
+        }
+        if (inner) {
+            const float* input_ptr = input + ic * IH * IW + ih_start * IW + iw_start;
+            for (size_t ico = 0; ico < 4; ++ico) {
+                if (ic + ico < IC) {
+#define cb(i) \
+    auto v##i##0 = GiLoadFloat32(input_ptr + i * IW); \
+    GiStoreFloat32(patch + ico * alpha * alpha + i * alpha, v##i##0); \
+    auto v##i##1 = GiLoadFloat32LowHalf(input_ptr + i * IW + 4); \
+    GiStoreFloat32(patch + ico * alpha * alpha + i * alpha + 4, v##i##1);
+                    UNROLL_CALL_NOWRAPPER(6, cb);
+#undef cb
+                    input_ptr += IH * IW;
+                }
+            }
+        } else {
+            size_t ih0 = std::max(0, ih_start), ih1 = std::min(ih_start + alpha, IH),
+                   iw0 = std::max(0, iw_start), iw1 = std::min(iw_start + alpha, IW);
+            for (size_t ico = 0; ico < 4 && ic + ico < IC; ++ico) {
+                for (size_t ih = ih0; ih < ih1; ++ih) {
+                    for (size_t iw = iw0; iw < iw1; ++iw) {
+                        patch[ico * alpha * alpha + (ih - ih_start) * alpha +
+                              (iw - iw_start)] =
+                                input[(ic + ico) * IH * IW + ih * IW + iw];
+                    }
+                }
+            }
+        }
+#define cb(i) transpose_4x4(patch + i * 4, patchT + i * 16, 36, 4);
+        UNROLL_CALL_NOWRAPPER(9, cb);
+#undef cb
+    }
+    static void transform(
+            const float* patchT, float* input_transform_buf, size_t unit_idx,
+            size_t nr_units_in_tile, size_t ic, size_t IC) {
+        constexpr size_t alpha = 4 + 3 - 1;
+#define cb(m, n) \
+    GI_FLOAT32_t d##m##n = GiLoadFloat32(patchT + m * alpha * 4 + n * 4), wd##m##n;
+        UNROLL_CALL_NOWRAPPER_D2(6, 6, cb);
+#undef cb
+        //! BT
+        //! 4    0   -5    0    1    0
+        //! 0   -4   -4    1    1    0
+        //! 0    4   -4   -1    1    0
+        //! 0   -2   -1    2    1    0
+        //! 0    2   -1   -2    1    0
+        //! 0    4    0   -5    0    1
+        //! wd0n = 4 * (d0n - d2n) + (d4n - d2n)
+        //! wd1n = (d3n + d4n) - 4 * (d1n + d2n)
+        //! wd2n = 4 * (d1n - d2n) + (d4n - d3n)
+        //! wd3n = (d4n - d2n) - 2 * (d1n - d3n)
+        //! wd4n = 2 * (d1n - d3n) + (d4n - d2n)
+        //! wd5n = 4 * (d1n - d3n) + (d5n - d3n)
+#define cb(n) \
+    { \
+        auto&& d4subd2 = SUBF(d4##n, d2##n); \
+        auto&& d1subd3 = SUBF(d1##n, d3##n); \
+        wd0##n = MLAF(d4subd2, SUBF(d0##n, d2##n), 4.0f); \
+        wd1##n = MLSF(ADDF(d3##n, d4##n), ADDF(d1##n, d2##n), 4.0f); \
+        wd2##n = MLAF(SUBF(d4##n, d3##n), SUBF(d1##n, d2##n), 4.0f); \
+        auto&& double_d1subd3 = MULSF(d1subd3, 2.0f); \
+        wd3##n = SUBF(d4subd2, double_d1subd3); \
+        wd4##n = ADDF(double_d1subd3, d4subd2); \
+        wd5##n = MLAF(SUBF(d5##n, d3##n), d1subd3, 4.0f); \
+    }
+        UNROLL_CALL_NOWRAPPER(6, cb);
+#undef cb
+        //! B
+        //!  4    0    0    0    0    0
+        //!  0   -4    4   -2    2    4
+        //! -5   -4   -4   -1   -1    0
+        //!  0    1   -1    2   -2   -5
+        //!  1    1    1    1    1    0
+        //!  0    0    0    0    0    1
+        //! dm0 = 4 * (wdm0 - wdm2) + (wdm4 - wdm2)
+        //! dm1 = (wdm3 + wdm4) - 4 * (wdm1 + wdm2)
+        //! dm2 = 4 * (wdm1 - wdm2) + (wdm4 - wdm3)
+        //! dm3 = (wdm4 - wdm2) - 2 * (wdm1 - wdm3)
+        //! dm4 = 2 * (wdm1 - wdm3) + (wdm4 - wdm2)
+        //! dm5 = 4 * (wdm1 - wdm3) + (wdm5 - wdm3)
+#define cb(m) \
+    { \
+        auto&& wd4subwd2 = SUBF(wd##m##4, wd##m##2); \
+        auto&& wd1subwd3 = SUBF(wd##m##1, wd##m##3); \
+        d##m##0 = MLAF(wd4subwd2, SUBF(wd##m##0, wd##m##2), 4.0f); \
+        d##m##1 = MLSF(ADDF(wd##m##3, wd##m##4), ADDF(wd##m##1, wd##m##2), 4.0f); \
+        d##m##2 = MLAF(SUBF(wd##m##4, wd##m##3), SUBF(wd##m##1, wd##m##2), 4.0f); \
+        auto&& double_wd1subwd3 = MULSF(wd1subwd3, 2.0f); \
+        d##m##3 = SUBF(wd4subwd2, double_wd1subwd3); \
+        d##m##4 = ADDF(double_wd1subwd3, wd4subwd2); \
+        d##m##5 = MLAF(SUBF(wd##m##5, wd##m##3), wd1subwd3, 4.0f); \
+    }
+        UNROLL_CALL_NOWRAPPER(6, cb);
+#undef cb
+        size_t ICB = IC / 4;
+        size_t icb = ic / 4;
+#define cb(m, n) \
+    GiStoreFloat32( \
+            input_transform_buf + (m * alpha + n) * ICB * 4 * nr_units_in_tile + \
+                    icb * nr_units_in_tile * 4 + unit_idx * 4, \
+            d##m##n);
+        UNROLL_CALL_NOWRAPPER_D2(6, 6, cb);
+#undef cb
+    }
+};  // InputTransform4X3
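
The fused wd*/d* expressions above are a hand-scheduled evaluation of BT * d * B. As a sanity check, here is a hypothetical scalar reference (not part of the patch) that computes the same 36 outputs per channel with plain loops; B is recovered as the transpose of BT:

#include <cstddef>

// BT as quoted in the comments above (F(4x4, 3x3), alpha = 6).
static const float kBT[6][6] = {
        {4, 0, -5, 0, 1, 0},  {0, -4, -4, 1, 1, 0}, {0, 4, -4, -1, 1, 0},
        {0, -2, -1, 2, 1, 0}, {0, 2, -1, -2, 1, 0}, {0, 4, 0, -5, 0, 1},
};

// out = BT * d * B for one 6x6 tile of a single channel.
void input_transform_ref(const float d[6][6], float out[6][6]) {
    float tmp[6][6];
    for (size_t m = 0; m < 6; ++m)
        for (size_t n = 0; n < 6; ++n) {
            tmp[m][n] = 0.f;
            for (size_t k = 0; k < 6; ++k)
                tmp[m][n] += kBT[m][k] * d[k][n];  // BT * d
        }
    for (size_t m = 0; m < 6; ++m)
        for (size_t n = 0; n < 6; ++n) {
            out[m][n] = 0.f;
            for (size_t k = 0; k < 6; ++k)
                out[m][n] += tmp[m][k] * kBT[n][k];  // * B, since B[k][n] == BT[n][k]
        }
}
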
+template <BiasMode bmode, typename Op>
+struct OutputTransform4X3 {
+    static void transform(
+            const float* output_transform_buf, const float* bias, float* output,
+            float* transform_mid_buf, size_t oh_start, size_t ow_start, size_t OH,
+            size_t OW, size_t oc_start, size_t oc_end, size_t oc_index, size_t unit_idx,
+            size_t nr_units_in_tile, const DType& src_dtype, const DType& dst_dtype) {
+        Op op(src_dtype, dst_dtype);
+        constexpr size_t alpha = 4 + 3 - 1;
+        size_t oc = oc_start + oc_index;
+        size_t OCB = (oc_end - oc_start) / 4;
+        size_t ocb = oc_index / 4;
+#define cb(m, n) \
+    auto v##m##n = GiLoadFloat32( \
+            output_transform_buf + (m * alpha + n) * OCB * nr_units_in_tile * 4 + \
+            ocb * nr_units_in_tile * 4 + unit_idx * 4);
+        UNROLL_CALL_NOWRAPPER_D2(6, 6, cb);
+#undef cb
+        //! AT
+        //! 1    1    1    1    1    0
+        //! 0    1   -1    2   -2    0
+        //! 0    1    1    4    4    0
+        //! 0    1   -1    8   -8    1
+        //! t0n = v0n + (v1n + v2n) + (v3n + v4n)
+        //! t1n = (v1n - v2n) + 2 * (v3n - v4n)
+        //! t2n = (v1n + v2n) + 4 * (v3n + v4n)
+        //! t3n = (v1n - v2n) + 8 * (v3n - v4n) + v5n
+#define cb(m, n) GI_FLOAT32_t t##m##n;
+        UNROLL_CALL_NOWRAPPER_D2(4, 6, cb);
+#undef cb
+#define cb(n) \
+    { \
+        auto&& v1addv2 = ADDF(v1##n, v2##n); \
+        auto&& v1subv2 = SUBF(v1##n, v2##n); \
+        auto&& v3addv4 = ADDF(v3##n, v4##n); \
+        auto&& v3subv4 = SUBF(v3##n, v4##n); \
+        t0##n = ADDF(ADDF(v0##n, v1addv2), v3addv4); \
+        t1##n = MLAF(v1subv2, v3subv4, 2.0f); \
+        t2##n = MLAF(v1addv2, v3addv4, 4.0f); \
+        t3##n = ADDF(MLAF(v1subv2, v3subv4, 8.0f), v5##n); \
+    }
+        UNROLL_CALL_NOWRAPPER(6, cb);
+#undef cb
+        //! A
+        //! 1    0    0    0
+        //! 1    1    1    1
+        //! 1   -1    1   -1
+        //! 1    2    4    8
+        //! 1   -2    4   -8
+        //! 0    0    0    1
+        //! vm0 = tm0 + (tm1 + tm2) + (tm3 + tm4)
+        //! vm1 = (tm1 - tm2) + 2 * (tm3 - tm4)
+        //! vm2 = (tm1 + tm2) + 4 * (tm3 + tm4)
+        //! vm3 = (tm1 - tm2) + 8 * (tm3 - tm4) + tm5
+#define cb(m) \
+    { \
+        auto&& t1addt2 = ADDF(t##m##1, t##m##2); \
+        auto&& t1subt2 = SUBF(t##m##1, t##m##2); \
+        auto&& t3addt4 = ADDF(t##m##3, t##m##4); \
+        auto&& t3subt4 = SUBF(t##m##3, t##m##4); \
+        v##m##0 = ADDF(ADDF(t##m##0, t1addt2), t3addt4); \
+        v##m##1 = MLAF(t1subt2, t3subt4, 2.0f); \
+        v##m##2 = MLAF(t1addt2, t3addt4, 4.0f); \
+        v##m##3 = ADDF(MLAF(t1subt2, t3subt4, 8.0f), t##m##5); \
+    }
+        UNROLL_CALL_NOWRAPPER(4, cb);
+#undef cb
+        GI_FLOAT32_t vbias;
+        if (bmode == BiasMode::BROADCAST_CHANNEL_BIAS) {
+            vbias = GiLoadFloat32(bias + oc);
+#define cb(m, n) v##m##n = GiAddFloat32(v##m##n, vbias);
+            UNROLL_CALL_NOWRAPPER_D2(4, 4, cb);
+#undef cb
+        }
+        if (bmode != BiasMode::BIAS) {
+#define cb(m, n) v##m##n = op(v##m##n);
+            UNROLL_CALL_NOWRAPPER_D2(4, 4, cb);
+#undef cb
+        }
+#define cb(m, n) GiStoreFloat32(transform_mid_buf + (4 * m + n) * 4, v##m##n);
+        UNROLL_CALL_NOWRAPPER_D2(4, 4, cb);
+#undef cb
+        for (size_t oho = 0; oho < 4 && oh_start + oho < OH; ++oho) {
+            for (size_t owo = 0; owo < 4 && ow_start + owo < OW; ++owo) {
+                for (size_t oco = 0; oco < 4 && oc + oco < oc_end; ++oco) {
+                    float res = transform_mid_buf[oho * 4 * 4 + owo * 4 + oco];
+                    size_t oh = oh_start + oho;
+                    size_t ow = ow_start + owo;
+                    if (bmode == BiasMode::BIAS) {
+                        res += bias[(oc + oco) * OH * OW + oh * OW + ow];
+                        res = op(res);
+                    }
+                    output[(oc + oco) * OH * OW + oh * OW + ow] = res;
+                }
+            }
+        }
+    }
+};  // OutputTransform4X3
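
Likewise, the t*/v* expressions implement AT * v * A, reducing each 6x6 accumulator tile to the 4x4 output block. A hypothetical scalar reference (not part of the patch):

// AT as quoted in the comments above; A is its transpose.
static const float kAT[4][6] = {
        {1, 1, 1, 1, 1, 0},
        {0, 1, -1, 2, -2, 0},
        {0, 1, 1, 4, 4, 0},
        {0, 1, -1, 8, -8, 1},
};

// out = AT * v * A for one 6x6 tile of a single channel.
void output_transform_ref(const float v[6][6], float out[4][4]) {
    float tmp[4][6];
    for (int m = 0; m < 4; ++m)
        for (int n = 0; n < 6; ++n) {
            tmp[m][n] = 0.f;
            for (int k = 0; k < 6; ++k)
                tmp[m][n] += kAT[m][k] * v[k][n];  // AT * v
        }
    for (int m = 0; m < 4; ++m)
        for (int n = 0; n < 4; ++n) {
            out[m][n] = 0.f;
            for (int k = 0; k < 6; ++k)
                out[m][n] += tmp[m][k] * kAT[n][k];  // * A, since A[k][n] == AT[n][k]
        }
}
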
+#undef MLSF
+#undef MLAF
+}  // namespace
+namespace megdnn {
+namespace fallback {
+namespace winograd {
+MEGDNN_REG_WINOGRAD_STRATEGY_IMPL(winograd_4x3_4x4_f)
+void winograd_4x3_4x4_f::filter(
+        const float* filter, float* filter_transform_buf, float* transform_mid_buf,
+        size_t OC, size_t IC, size_t oc_start, size_t oc_end) {
+    FilterTransform4X3<megdnn::param::MatrixMul::Format::MK4>::transform(
+            filter, filter_transform_buf, transform_mid_buf, OC, IC, oc_start, oc_end);
+}
+void winograd_4x3_4x4_f::input(
+        const float* input, float* input_transform_buf, float* transform_mid_buf,
+        size_t IH, size_t IW, size_t IC, size_t PH, size_t PW, size_t unit_start_idx,
+        size_t nr_units_in_tile) {
+    megdnn_assert(IC % 4 == 0);
+    auto unit_w = div_ceil<size_t>(IW + 2 * PW - KERNEL_SIZE + 1, OUTPUT_BLOCK_SIZE);
+    float* patch = transform_mid_buf;
+    float* patchT = transform_mid_buf + 4 * ALPHA * ALPHA;
+    for (size_t ic = 0; ic < IC; ic += 4) {
+        for (size_t unit_idx = 0; unit_idx < nr_units_in_tile; ++unit_idx) {
+            size_t index = unit_start_idx + unit_idx;
+            size_t oht = index / unit_w;
+            size_t owt = index % unit_w;
+            int ih_start = static_cast<int>(oht * OUTPUT_BLOCK_SIZE - PH);
+            int iw_start = static_cast<int>(owt * OUTPUT_BLOCK_SIZE - PW);
+            if (ih_start >= 0 && ih_start + 6 <= static_cast<int>(IH) &&
+                iw_start >= 0 && iw_start + 6 <= static_cast<int>(IW)) {
+                InputTransform4X3::transpose<true>(
+                        input, patch, patchT, ih_start, iw_start, IH, IW, ic, IC);
+            } else {
+                InputTransform4X3::transpose<false>(
+                        input, patch, patchT, ih_start, iw_start, IH, IW, ic, IC);
+            }
+            InputTransform4X3::transform(
+                    patchT, input_transform_buf, unit_idx, nr_units_in_tile, ic, IC);
+        }
+    }
+}
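
To make the tiling arithmetic above concrete, a small standalone sketch with hypothetical sizes (IH = IW = 10, PH = PW = 1): unit 0 starts at (-1, -1) and takes the zero-padded border path, while unit 4 starts at (3, 3), its whole 6x6 window fits, and it takes the <true> fast path:

#include <cstdio>

int main() {
    const int IH = 10, IW = 10, PH = 1, PW = 1, OB = 4, K = 3;
    const int unit_w = (IW + 2 * PW - K + 1 + OB - 1) / OB;  // div_ceil = 3
    for (int index : {0, 4}) {
        const int ih_start = index / unit_w * OB - PH;
        const int iw_start = index % unit_w * OB - PW;
        const bool inner = ih_start >= 0 && ih_start + 6 <= IH &&
                           iw_start >= 0 && iw_start + 6 <= IW;
        std::printf("unit %d: ih_start=%d iw_start=%d inner=%d\n", index,
                    ih_start, iw_start, inner);
    }
    return 0;
}
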
+void winograd_4x3_4x4_f::output(
+        const float* output_transform_buf, const float* bias, float* output,
+        float* transform_mid_buf, BiasMode bmode, NonlineMode nonline_mode, size_t OH,
+        size_t OW, size_t oc_start, size_t oc_end, size_t unit_start_idx,
+        size_t nr_units_in_tile) {
+#define cb(_bmode, _nonline_mode, ...) \
+    OutputTransform4X3<_bmode, _nonline_mode>::transform(__VA_ARGS__);
+    auto unit_w = div_ceil<size_t>(OW, OUTPUT_BLOCK_SIZE);
+    for (size_t oc = oc_start; oc < oc_end; oc += 4) {
+        size_t oc_index = oc - oc_start;
+        for (size_t unit_idx = 0; unit_idx < nr_units_in_tile; ++unit_idx) {
+            size_t index = unit_idx + unit_start_idx;
+            size_t oht = index / unit_w;
+            size_t owt = index % unit_w;
+            size_t oh_start = oht * OUTPUT_BLOCK_SIZE;
+            size_t ow_start = owt * OUTPUT_BLOCK_SIZE;
+            GI_DISPATCH_CONV_WINOGRAD_BIAS(
+                    megdnn_fallback_winograd_fp32_F43_4x4, cb, float, float, bmode,
+                    nonline_mode, output_transform_buf, bias, output, transform_mid_buf,
+                    oh_start, ow_start, OH, OW, oc_start, oc_end, oc_index, unit_idx,
+                    nr_units_in_tile, src_dtype, dst_dtype);
+        }
+    }
+#undef cb
+}
+}  // namespace winograd
+}  // namespace fallback
+}  // namespace megdnn
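
Taken together, filter(), input() and output() realize the standard Winograd identity for F(4x4, 3x3) with alpha = 6,

    Y = A^T [ (G g G^T) ⊙ (B^T d B) ] A

where d is a 6x6 input tile, g the 3x3 filter, ⊙ the elementwise product (implemented here as the batched MK4 matmul across channels), B^T and A^T are the matrices quoted in the comments above, and G is the 6x3 filter-transform matrix applied by FilterTransform4X3.
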
@@ -121,7 +121,7 @@
         for (auto&& algo : matmul_algos) {
             if (is_naive(algo))
                 continue;
-            for (uint32_t tile_size : {16, 8, 24, 32}) {
+            for (uint32_t tile_size : {16, 8, 24, 32, 48, 68}) {
                 refhold.emplace_back(new AlgoFP32WinogradF23_4x4(
                         static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
                         tile_size));
@@ -130,10 +130,18 @@ public:
                         static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
                         tile_size));
                 m_gi_winograd_algos.emplace_back(refhold.back().get());
+                refhold.emplace_back(new AlgoFP32WinogradF43_4x4(
+                        static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
+                        tile_size));
+                m_gi_winograd_algos.emplace_back(refhold.back().get());
                 refhold.emplace_back(new AlgoFP32WinogradF63_4x4_NCHW44(
                         static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
                         tile_size));
                 m_gi_winograd_algos.emplace_back(refhold.back().get());
+                refhold.emplace_back(new AlgoFP32WinogradF43_4x4_NCHW44(
+                        static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
+                        tile_size));
+                m_gi_winograd_algos.emplace_back(refhold.back().get());
                 refhold.emplace_back(new AlgoFP32WinogradF23_4x4_NCHW44(
                         static_cast<fallback::MatrixMulImpl::AlgoBase*>(algo),
                         tile_size));
@@ -219,9 +219,11 @@ public:
     GI_COMMON_WINOGRAD_F63_FP32,
     GI_COMMON_WINOGRAD_F43_FP32,
     GI_COMMON_WINOGRAD_F63_4X4_FP32,
+    GI_COMMON_WINOGRAD_F43_4X4_FP32,
     GI_COMMON_WINOGRAD_F54_FP32,
     GI_COMMON_WINOGRAD_F45_FP32,
     GI_COMMON_WINOGRAD_F23_4X4_NCHW44_F32,
+    GI_COMMON_WINOGRAD_F43_4X4_NCHW44_F32,
     GI_COMMON_WINOGRAD_F63_4X4_NCHW44_F32,
     GI_COMMON_WINOGRAD_F73_4X4_NCHW44_F32,
     GI_COMMON_DIRECT_FP32,
@@ -382,9 +384,11 @@ private:
     class AlgoFP32WinogradF63;
     class AlgoFP32WinogradF43;
     class AlgoFP32WinogradF63_4x4;
+    class AlgoFP32WinogradF43_4x4;
     class AlgoFP32WinogradF54;
     class AlgoFP32WinogradF45;
     class AlgoFP32WinogradF23_4x4_NCHW44;
+    class AlgoFP32WinogradF43_4x4_NCHW44;
     class AlgoFP32WinogradF63_4x4_NCHW44;
     class AlgoFP32WinogradF73_4x4_NCHW44;
@@ -1013,6 +1013,27 @@ TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F43_F63) {
             handle(), 3);
 #endif
 }
+TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_44_F43_F23) {
+#if MEGDNN_AARCH64
+    benchmark_winograd_compare(
+            "WINOGRAD:.*:4:4:.*:3", "WINOGRAD:.*:4:2", handle(), 3, 4);
+#endif
+}
+TEST_F(ARM_COMMON, BENCHMARK_WINOGRAD_F43_44) {
+#if MEGDNN_AARCH64
+    benchmark_winograd_weight_preprocess("WINOGRAD:.*:4:4:.*:3", handle(), 3, 4);
+#endif
+}
+TEST_F(ARM_COMMON, BENCHMARK_WINOGRAD_F43_NCHW44) {
+#if MEGDNN_AARCH64
+    benchmark_winograd_weight_preprocess(
+            "WINOGRAD_NCHW44:.*:4:4:.*:3", handle(), 3, 4, 4);
+#endif
+}
 TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F63) {
 #if MEGDNN_AARCH64
     benchmark_winograd("WINOGRAD:AARCH64_F32K8X12X1:1:6", handle(), 3);
@@ -902,7 +902,8 @@ void check_conv_bias(
 }
 #if MEGDNN_WITH_BENCHMARK
 std::vector<conv_bias::TestArg> get_winograd_benchmark_args(
-        size_t kernel, size_t pack_size) {
+        size_t kernel, size_t pack_size, size_t io_pack_size) {
+    megdnn_assert(io_pack_size == 1 || io_pack_size == 4);
     std::vector<conv_bias::TestArg> args;
     auto pack = [&](size_t oc, size_t ic, size_t w, size_t h, size_t kernel, size_t p) {
         if (ic % pack_size != 0 || oc % pack_size != 0)
@@ -915,11 +916,20 @@ std::vector<conv_bias::TestArg> get_winograd_benchmark_args(
         param.pad_h = p;
         param.pad_w = p;
-        args.push_back(conv_bias::TestArg{
-                param,
-                TensorShape{1, ic, h, w},
-                TensorShape{oc, ic, kernel, kernel},
-                {1, oc, 1, 1}});
+        if (io_pack_size == 4) {
+            param.format = param::ConvBias::Format::NCHW44;
+            args.push_back(conv_bias::TestArg{
+                    param,
+                    TensorShape{1, ic / 4, h, w, 4},
+                    TensorShape{oc / 4, ic / 4, kernel, kernel, 4, 4},
+                    {1, oc / 4, 1, 1, 4}});
+        } else {
+            args.push_back(conv_bias::TestArg{
+                    param,
+                    TensorShape{1, ic, h, w},
+                    TensorShape{oc, ic, kernel, kernel},
+                    {1, oc, 1, 1}});
+        }
     };
     for (size_t ic : {8, 16, 32, 64}) {
@@ -950,8 +960,9 @@
 }
 void benchmark_winograd(
-        const char* algo_name, Handle* handle, size_t kernel, size_t pack_size) {
-    auto&& args = get_winograd_benchmark_args(kernel, pack_size);
+        const char* algo_name, Handle* handle, size_t kernel, size_t pack_size,
+        size_t io_pack_size) {
+    auto&& args = get_winograd_benchmark_args(kernel, pack_size, io_pack_size);
     using namespace conv_bias;
     constexpr size_t RUN = 10;
     Benchmarker<Convolution> benchmark(handle);
@@ -969,10 +980,17 @@
         opr->deduce_layout(
                 {arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()},
                 {arg.bias, dtype::Float32()}, {}, dst_layout);
-        //! dst.nr_elems * IC * FH * FW * 2
-        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
-                             arg.filter[2] * arg.filter[3] * 2.0 /
-                             (1024 * 1024 * 1024) * 1e3;
+        float computations = 0.0;
+        if (io_pack_size == 1) {
+            //! dst.nr_elems * IC * FH * FW * 2
+            computations = dst_layout.total_nr_elems() * arg.filter[1] * arg.filter[2] *
+                           arg.filter[3] * 2.0 / (1024 * 1024 * 1024) * 1e3;
+        } else {
+            //! dst.nr_elems * IC/4 * FH * FW * 4 * 2
+            computations = dst_layout.total_nr_elems() * arg.filter[1] * arg.filter[2] *
+                           arg.filter[3] * arg.filter[4] * 2.0 / (1024 * 1024 * 1024) *
+                           1e3;
+        }
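
A worked instance of the NCHW44 branch (hypothetical layer: N = 1, IC = OC = 64, OH = OW = 56, 3x3 filter, i.e. filter shape {16, 16, 3, 3, 4, 4}): dst_layout.total_nr_elems() = 1 * 16 * 56 * 56 * 4 = 200704, so computations = 200704 * 16 * 3 * 3 * 4 * 2 / 2^30 * 1e3 ≈ 215.3. Note that arg.filter[1] * arg.filter[4] = (IC / 4) * 4 restores the full IC, so both branches count the same 2 * IC * FH * FW multiply-adds per output element.
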
         param::Convolution conv_param;
         conv_param.pad_h = arg.param.pad_h;
@@ -999,9 +1017,9 @@
 // usage of weight pre-processing for winograd benchmark
 void benchmark_winograd_weight_preprocess(
-        const char* algo_name, megdnn::Handle* handle, size_t kernel,
-        size_t pack_size) {
-    auto&& args = get_winograd_benchmark_args(kernel, pack_size);
+        const char* algo_name, megdnn::Handle* handle, size_t kernel, size_t pack_size,
+        size_t io_pack_size) {
+    auto&& args = get_winograd_benchmark_args(kernel, pack_size, io_pack_size);
     using namespace conv_bias;
     constexpr size_t RUN = 10;
@@ -1018,16 +1036,17 @@
         opr->deduce_layout(
                 {arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()},
                 {arg.bias, dtype::Float32()}, {}, dst_layout);
-        //! dst.nr_elems * IC * FH * FW * 2
-        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
-                             arg.filter[2] * arg.filter[3] * 2.0 /
-                             (1024 * 1024 * 1024) * 1e3;
-        param::Convolution conv_param;
-        conv_param.pad_h = arg.param.pad_h;
-        conv_param.pad_w = arg.param.pad_w;
-        conv_param.stride_h = arg.param.stride_h;
-        conv_param.stride_w = arg.param.stride_w;
+        float computations = 0.0;
+        if (io_pack_size == 1) {
+            //! dst.nr_elems * IC * FH * FW * 2
+            computations = dst_layout.total_nr_elems() * arg.filter[1] * arg.filter[2] *
+                           arg.filter[3] * 2.0 / (1024 * 1024 * 1024) * 1e3;
+        } else {
+            //! dst.nr_elems * IC/4 * FH * FW * 4 * 2
+            computations = dst_layout.total_nr_elems() * arg.filter[1] * arg.filter[2] *
+                           arg.filter[3] * arg.filter[4] * 2.0 / (1024 * 1024 * 1024) *
+                           1e3;
+        }
         benchmark_winograd.set_param(arg.param);
         auto used_winograd =
@@ -1045,8 +1064,8 @@
 void benchmark_winograd_compare(
         const char* algoA_name, const char* algoB_name, megdnn::Handle* handle,
-        size_t kernel, size_t pack_size) {
-    auto&& args = get_winograd_benchmark_args(kernel, pack_size);
+        size_t kernel, size_t pack_size, size_t io_pack_size) {
+    auto&& args = get_winograd_benchmark_args(kernel, pack_size, io_pack_size);
     using namespace conv_bias;
     constexpr size_t RUN = 10;
@@ -1062,16 +1081,17 @@
         opr->deduce_layout(
                 {arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()},
                 {arg.bias, dtype::Float32()}, {}, dst_layout);
-        //! dst.nr_elems * IC * FH * FW * 2
-        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
-                             arg.filter[2] * arg.filter[3] * 2.0 /
-                             (1024 * 1024 * 1024) * 1e3;
-        param::Convolution conv_param;
-        conv_param.pad_h = arg.param.pad_h;
-        conv_param.pad_w = arg.param.pad_w;
-        conv_param.stride_h = arg.param.stride_h;
-        conv_param.stride_w = arg.param.stride_w;
+        float computations = 0.0;
+        if (io_pack_size == 1) {
+            //! dst.nr_elems * IC * FH * FW * 2
+            computations = dst_layout.total_nr_elems() * arg.filter[1] * arg.filter[2] *
+                           arg.filter[3] * 2.0 / (1024 * 1024 * 1024) * 1e3;
+        } else {
+            //! dst.nr_elems * IC/4 * FH * FW * 4 * 2
+            computations = dst_layout.total_nr_elems() * arg.filter[1] * arg.filter[2] *
+                           arg.filter[3] * arg.filter[4] * 2.0 / (1024 * 1024 * 1024) *
+                           1e3;
+        }
         benchmark_winograd.set_param(arg.param);
         auto used_winograd1 =
@@ -62,16 +62,16 @@ void check_conv_bias(
 #if MEGDNN_WITH_BENCHMARK
 std::vector<conv_bias::TestArg> get_winograd_benchmark_args(
-        size_t kernel, size_t pack_size = 1);
+        size_t kernel, size_t pack_size = 1, size_t io_pack_size = 1);
 void benchmark_winograd(
         const char* algo_name, megdnn::Handle* handle, size_t kernel,
-        size_t pack_size = 1);
+        size_t pack_size = 1, size_t io_pack_size = 1);
 void benchmark_winograd_weight_preprocess(
         const char* algo_name, megdnn::Handle* handle, size_t kernel,
-        size_t pack_size = 1);
+        size_t pack_size = 1, size_t io_pack_size = 1);
 void benchmark_winograd_compare(
         const char* algoA_name, const char* algoB_name, megdnn::Handle* handle,
-        size_t kernel, size_t pack_size = 1);
+        size_t kernel, size_t pack_size = 1, size_t io_pack_size = 1);
 #endif  // MEGDNN_WITH_BENCHMARK
 template <class Checker>
 void check_winograd(
@@ -597,6 +597,25 @@ TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F63_4_NCHW44) {
             param::ConvBias::Format::NCHW44);
 }
+TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F43_4_NCHW44) {
+    using namespace conv_bias;
+    std::vector<TestArg> args =
+            get_nchw44_conv_bias_args({3}, QUAN_NLMODE, BR_AND_NO_BIASMODE, 1);
+    Checker<ConvBiasForward> checker(handle());
+    check_winograd(
+            "4:4:16", checker, args, param::MatrixMul::Format::MK4,
+            param::ConvBias::Format::NCHW44);
+}
+TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F43_4_WEIGHT_PREPROCESS) {
+    using namespace conv_bias;
+    std::vector<TestArg> args = get_winograd_mk_packed_args();
+    Checker<ConvBiasForward, OprWeightPreprocessProxy<ConvBiasForward>> checker(
+            handle());
+    check_winograd("4:4:16", checker, args, param::MatrixMul::Format::MK4);
+}
 TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_WINOGRAD_F54) {
     using namespace conv_bias;
     std::vector<TestArg> args = get_winograd_args(4);