
fix(dnn): fix reduce sum/mean error when b is large

GitOrigin-RevId: d1bae619b1
Branch: HuaHua404-patch-4
Megvii Engine Team · 2 years ago
parent commit 1b94380794
4 changed files with 410 additions and 41 deletions:
  1. dnn/src/fallback/reduce/opr_impl.cpp  (+63, -36)
  2. dnn/src/fallback/reduce/opr_impl.h    (+3, -0)
  3. dnn/src/fallback/reduce/reducer.h     (+272, -5)
  4. dnn/test/fallback/reduce.cpp          (+72, -0)

dnn/src/fallback/reduce/opr_impl.cpp  (+63, -36)

@@ -5,7 +5,6 @@

#include "midout.h"
#include "reducer.h"
#include "src/common/reduce_helper.h"

MIDOUT_DECL(megdnn_fb_reduce_op)
MIDOUT_DECL(megdnn_fb_reduce_c)
@@ -67,6 +66,27 @@ void reduce_exec(size_t A, size_t B, size_t C, Op op) MEGDNN_NOEXCEPT {
namespace megdnn {
namespace fallback {

size_t ReduceImpl::get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& dst) {
MEGDNN_MARK_USED_VAR(src);
MEGDNN_MARK_USED_VAR(dst);

if (src.dtype.enumv() == DTypeEnum::Float32 &&
(param().mode == Mode::MEAN || param().mode == Mode::SUM ||
param().mode == Mode::SUM_SQR)) {
size_t A, B, C;
reduce::get_ABC(src, A, B, C, param().axis);
if (C == 1) {
// Workspace bound for the multi-pass large-B kernels in reducer.h.
// Take B = 247 as an example to see where each term comes from:
size_t _60xT_in_4 = (60 * 3) / 4;  // every full 60 * T block emits 4 floats; T = 3
size_t _60xX_in_4 = 4;             // one trailing 60 * X block, 0 < X < T (X = 1, 2)
size_t _XXxT_in_4 = 4;             // the final tail of fewer than 60 elements
return ((B / _60xT_in_4 + _60xX_in_4 + _XXxT_in_4) * sizeof(float));
}
}
return naive::ReduceForwardImpl::get_workspace_in_bytes(src, dst);
}
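
As a quick sanity check of this bound, here is a worked trace through the kernels in reducer.h (an illustration, not part of the commit):

// B = 247, SIMD_WIDTH = 4, T = 3:
//   kern_4x15xT: one full 60 * T = 180-element group  -> 4 floats
//                one partial 60 * 1 group             -> 4 floats
//   kern_4xXXx1: the remaining 247 - 240 = 7 elements -> 4 floats
// The first pass thus writes 12 floats, within the bound of
//   (247 / 45 + 4 + 4) * sizeof(float) = 13 * 4 = 52 bytes.
// Later passes only shrink the live buffer, so the first pass dominates.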

void ReduceImpl::exec(
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
check_exec(src.layout, dst.layout, workspace.size);
@@ -178,45 +198,52 @@ void ReduceImpl::exec_fallback(
}

bool ReduceImpl::exec_optimized(
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace) {
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace workspace) {
size_t A, B, C;
reduce::get_ABC(src.layout, A, B, C, param().axis);
bool execed = false;
using Mode = param::Reduce::Mode;
#define DISPATCH_FUNC(Reducer, dtype, ctype, comp_type) \
if (C == 1) { \
using _Reducer = Reducer<dtype, ctype, comp_type, true>; \
using _ReducerC1SmallB = Reducer<dtype, ctype, comp_type, false>; \
std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
do_reduce = Exec<_Reducer, true>::do_reduce; \
if (B == 2) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 2>::do_reduce; \
if (B == 3) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 3>::do_reduce; \
if (B == 4) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 4>::do_reduce; \
MIDOUT_BEGIN( \
megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \
midout_iv(0)) { \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
reinterpret_cast<ctype*>(src.raw_ptr()), \
reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
execed = true; \
} \
MIDOUT_END(); \
} else { \
using _Reducer = Reducer<dtype, ctype, comp_type, false>; \
std::function<void(const ctype*, ctype*, DType, size_t, size_t, size_t)> \
do_reduce = Exec<_Reducer, false>::do_reduce; \
MIDOUT_BEGIN( \
megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \
midout_iv(1)) { \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
reinterpret_cast<ctype*>(src.raw_ptr()), \
reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C)); \
execed = true; \
} \
MIDOUT_END(); \

#define DISPATCH_FUNC(Reducer, dtype, ctype, comp_type) \
if (C == 1) { \
using _Reducer = Reducer<dtype, ctype, comp_type, true>; \
using _ReducerC1SmallB = Reducer<dtype, ctype, comp_type, false>; \
std::function<void( \
const ctype*, ctype*, DType, size_t, size_t, size_t, \
_megdnn_workspace)> \
do_reduce = Exec<_Reducer, true>::do_reduce; \
if (B == 2) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 2>::do_reduce; \
if (B == 3) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 3>::do_reduce; \
if (B == 4) \
do_reduce = ExecC1SmallB<_ReducerC1SmallB, ctype, 4>::do_reduce; \
MIDOUT_BEGIN( \
megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \
midout_iv(0)) { \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
reinterpret_cast<ctype*>(src.raw_ptr()), \
reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C, \
workspace)); \
execed = true; \
} \
MIDOUT_END(); \
} else { \
using _Reducer = Reducer<dtype, ctype, comp_type, false>; \
std::function<void( \
const ctype*, ctype*, DType, size_t, size_t, size_t, \
_megdnn_workspace)> \
do_reduce = Exec<_Reducer, false>::do_reduce; \
MIDOUT_BEGIN( \
megdnn_fallback_reduce_optimized, ctype, dtype, comp_type, \
midout_iv(1)) { \
MEGDNN_DISPATCH_CPU_KERN_OPR(do_reduce( \
reinterpret_cast<ctype*>(src.raw_ptr()), \
reinterpret_cast<ctype*>(dst.raw_ptr()), src_type, A, B, C, \
workspace)); \
execed = true; \
} \
MIDOUT_END(); \
}

#define DISPATCH_MODE_QUANTIZED(dtype, ctype, comp_type) \


dnn/src/fallback/reduce/opr_impl.h  (+3, -0)

@@ -1,4 +1,5 @@
#pragma once
#include "src/common/reduce_helper.h"
#include "src/naive/reduce/opr_impl.h"

namespace megdnn {
@@ -13,6 +14,8 @@ public:
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace);
void exec_fallback(
_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace);
size_t get_workspace_in_bytes(
const TensorLayout& src, const TensorLayout& dst) override;
};

} // namespace fallback


dnn/src/fallback/reduce/reducer.h  (+272, -5)

@@ -1,5 +1,6 @@
#pragma once

#include "src/common/unroll_macro.h"
#include "src/common/utils.h"
#include "src/fallback/general_intrinsic/gi_float.h"
#include "src/fallback/general_intrinsic/gi_int.h"
@@ -395,14 +396,14 @@ template <typename Reducer, bool C1>
struct Exec {
static void do_reduce(
const typename Reducer::ctype* src, typename Reducer::ctype* dst,
DType src_dtype, size_t A, size_t B, size_t C);
DType src_dtype, size_t A, size_t B, size_t C, _megdnn_workspace);
};

template <typename Reducer>
struct Exec<Reducer, true> {
static void do_reduce(
const typename Reducer::ctype* src, typename Reducer::ctype* dst,
DType src_dtype, size_t A, size_t B, size_t) {
DType src_dtype, size_t A, size_t B, size_t, _megdnn_workspace) {
size_t a = 0;
for (; a < A; a++) {
Reducer reducer0(src_dtype, B);
@@ -426,7 +427,7 @@ template <typename Reducer>
struct Exec<Reducer, false> {
static void do_reduce(
const typename Reducer::ctype* src, typename Reducer::ctype* dst,
DType src_dtype, size_t A, size_t B, size_t C) {
DType src_dtype, size_t A, size_t B, size_t C, _megdnn_workspace) {
for (size_t a = 0; a < A; a++) {
size_t c = 0;
for (; c + Reducer::SIMD_WIDTH <= C; c += Reducer::SIMD_WIDTH) {
@@ -448,10 +449,276 @@ struct Exec<Reducer, false> {
}
};

// kern_4x15xT()
// 1. Processes the input in groups of SIMD_WIDTH x 15 x T elements.
// 2. T bounds the accumulation depth (and thus the rounding error):
//    SIMD_WIDTH x 15 x T elements are folded into one SIMD_WIDTH-wide
//    partial sum before it is flushed.
// 3. d0-d14 are used for loading; they are halved and added pairwise until
//    one vector remains, which is accumulated into d15. d15 is stored once
//    every T iterations.

// kern_4xXXx1()
// Entered once fewer than 60 elements remain.
// 1. The first switch packs the trailing scalars (remain_size % SIMD_WIDTH)
//    into a single vector, so the rest of the tail can stay in vector units.
// 2. The second switch loads the remaining full vectors.
// 3. The third switch reduces the loaded vectors pairwise down to one vector.
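
What the two kernels implement is blocked pairwise summation, which keeps every accumulator shallow instead of feeding all B elements into one running sum. A minimal scalar sketch of the same summation order (a hypothetical helper for illustration; blocked_pairwise_sum is not part of the commit):

#include <cstddef>

// Scalar model of kern_4x15xT: reduce each 15-element block by tree addition,
// accumulate T blocks into `acc`, then emit `acc` as one partial sum. The
// vector kernel does the same with SIMD_WIDTH lanes per slot.
static void blocked_pairwise_sum(
        const float* src, size_t n, float* partials, size_t& num_partials,
        size_t T) {
    size_t i = 0;
    while (i + 15 <= n) {
        float acc = 0.f;
        for (size_t t = 0; t < T && i + 15 <= n; ++t, i += 15) {
            const float* p = src + i;
            // tree addition of 15 values, mirroring the d0..d14 cascade
            float s01 = p[0] + p[1], s23 = p[2] + p[3], s45 = p[4] + p[5];
            float s67 = p[6] + p[7], s89 = p[8] + p[9], sab = p[10] + p[11];
            float scd = p[12] + p[13];
            float s0 = s01 + s23, s4 = s45 + s67, s8 = s89 + sab;
            float sc = scd + p[14];
            acc += (s0 + s4) + (s8 + sc);
        }
        partials[num_partials++] = acc;
    }
    // anything past the last full 15-block is left for the tail kernel
}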
#define ImplementC1LargeB(rd_type, coef, case_load, load, for_shift, cal_final_res) \
template <> \
struct Exec<rd_type##Reducer<dt_float32, float, float, true>, true> { \
using rd_type##Reducer_ = rd_type##Reducer<dt_float32, float, float, true>; \
static constexpr int SIMD_WIDTH = GI_SIMD_LEN_BYTE / sizeof(float); \
static constexpr int VREG_NUM = 16; \
static void kern_4x15xT( \
const float* read_ptr, size_t& read_idx, float* write_ptr, \
size_t& write_idx, size_t remain_size, size_t T) { \
GI_FLOAT32_t d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, \
d14, d15; \
constexpr size_t STEP = SIMD_WIDTH * (VREG_NUM - 1); \
while (read_idx + STEP <= remain_size) { \
d15 = GiBroadcastFloat32(0.0); \
size_t i = 0; \
for (; read_idx + STEP <= remain_size && i < T; \
read_idx += STEP, i++) { \
const float* _read_ptr = read_ptr + read_idx; \
UNROLL_CALL_RAW(15, load, _read_ptr, read_ptr, write_ptr) \
d0 = GiAddFloat32(d0, d1); \
d2 = GiAddFloat32(d2, d3); \
d4 = GiAddFloat32(d4, d5); \
d6 = GiAddFloat32(d6, d7); \
d8 = GiAddFloat32(d8, d9); \
d10 = GiAddFloat32(d10, d11); \
d12 = GiAddFloat32(d12, d13); \
d0 = GiAddFloat32(d0, d2); \
d4 = GiAddFloat32(d4, d6); \
d8 = GiAddFloat32(d8, d10); \
d12 = GiAddFloat32(d12, d14); \
d0 = GiAddFloat32(d0, d4); \
d8 = GiAddFloat32(d8, d12); \
d0 = GiAddFloat32(d0, d8); \
d15 = GiAddFloat32(d0, d15); \
} \
GiStoreFloat32(write_ptr + write_idx, d15); \
write_idx += SIMD_WIDTH; \
} \
} \
static void kern_4xXXx1( \
const float* read_ptr, size_t& read_idx, float* write_ptr, \
size_t& write_idx, size_t remain_size) { \
size_t block_num = remain_size / SIMD_WIDTH; \
size_t tail_num = remain_size % SIMD_WIDTH; \
if (block_num == 0) { \
for_shift(read_ptr, read_idx, write_ptr, write_idx, tail_num); \
write_idx += tail_num; \
} else { \
GI_FLOAT32_t d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, \
d13, d14, d15; \
float buf[4]; \
switch (tail_num) { \
case 3: \
buf[0] = read_ptr[read_idx + remain_size - 1]; \
buf[1] = read_ptr[read_idx + remain_size - 2]; \
buf[2] = read_ptr[read_idx + remain_size - 3]; \
buf[3] = 0; \
load(0, buf, read_ptr, write_ptr); \
break; \
case 2: \
buf[0] = read_ptr[read_idx + remain_size - 1]; \
buf[1] = read_ptr[read_idx + remain_size - 2]; \
buf[2] = 0; \
buf[3] = 0; \
load(0, buf, read_ptr, write_ptr); \
break; \
case 1: \
buf[0] = read_ptr[read_idx + remain_size - 1]; \
buf[1] = 0; \
buf[2] = 0; \
buf[3] = 0; \
load(0, buf, read_ptr, write_ptr); \
break; \
default: \
d0 = GiBroadcastFloat32(0.0); \
break; \
} \
d15 = d0; \
remain_size -= tail_num; \
const float* _read_ptr = read_ptr + read_idx; \
switch (block_num) { \
case_load(15, _read_ptr, 14, read_ptr, write_ptr); \
case_load(14, _read_ptr, 13, read_ptr, write_ptr); \
case_load(13, _read_ptr, 12, read_ptr, write_ptr); \
case_load(12, _read_ptr, 11, read_ptr, write_ptr); \
case_load(11, _read_ptr, 10, read_ptr, write_ptr); \
case_load(10, _read_ptr, 9, read_ptr, write_ptr); \
case_load(9, _read_ptr, 8, read_ptr, write_ptr); \
case_load(8, _read_ptr, 7, read_ptr, write_ptr); \
case_load(7, _read_ptr, 6, read_ptr, write_ptr); \
case_load(6, _read_ptr, 5, read_ptr, write_ptr); \
case_load(5, _read_ptr, 4, read_ptr, write_ptr); \
case_load(4, _read_ptr, 3, read_ptr, write_ptr); \
case_load(3, _read_ptr, 2, read_ptr, write_ptr); \
case_load(2, _read_ptr, 1, read_ptr, write_ptr); \
case_load(1, _read_ptr, 0, read_ptr, write_ptr); \
default: \
break; \
} \
d0 = GiAddFloat32(d0, d15); \
while (block_num > 1) { \
switch (block_num) { \
case 15: \
case 14: \
d0 = GiAddFloat32(d0, d1); \
d1 = GiAddFloat32(d2, d3); \
d2 = GiAddFloat32(d4, d5); \
d3 = GiAddFloat32(d6, d7); \
d4 = GiAddFloat32(d8, d9); \
d5 = GiAddFloat32(d10, d11); \
d6 = GiAddFloat32(d12, d13); \
if (block_num & 1) \
d7 = d14; \
break; \
case 13: \
case 12: \
d0 = GiAddFloat32(d0, d1); \
d1 = GiAddFloat32(d2, d3); \
d2 = GiAddFloat32(d4, d5); \
d3 = GiAddFloat32(d6, d7); \
d4 = GiAddFloat32(d8, d9); \
d5 = GiAddFloat32(d10, d11); \
if (block_num & 1) \
d6 = d12; \
break; \
case 11: \
case 10: \
d0 = GiAddFloat32(d0, d1); \
d1 = GiAddFloat32(d2, d3); \
d2 = GiAddFloat32(d4, d5); \
d3 = GiAddFloat32(d6, d7); \
d4 = GiAddFloat32(d8, d9); \
if (block_num & 1) \
d5 = d10; \
break; \
case 9: \
case 8: \
d0 = GiAddFloat32(d0, d1); \
d1 = GiAddFloat32(d2, d3); \
d2 = GiAddFloat32(d4, d5); \
d3 = GiAddFloat32(d6, d7); \
if (block_num & 1) \
d4 = d8; \
break; \
case 7: \
case 6: \
d0 = GiAddFloat32(d0, d1); \
d1 = GiAddFloat32(d2, d3); \
d2 = GiAddFloat32(d4, d5); \
if (block_num & 1) \
d3 = d6; \
break; \
case 5: \
case 4: \
d0 = GiAddFloat32(d0, d1); \
d1 = GiAddFloat32(d2, d3); \
if (block_num & 1) \
d2 = d4; \
break; \
case 3: \
case 2: \
d0 = GiAddFloat32(d0, d1); \
if (block_num & 1) \
d1 = d2; \
default: \
break; \
} \
block_num = (block_num + 1) / 2; \
} \
GiStoreFloat32(write_ptr + write_idx, d0); \
write_idx += SIMD_WIDTH; \
} \
} \
static void do_reduce( \
const float* src, float* dst, DType src_dtype, size_t A, size_t B, \
size_t, _megdnn_workspace workspace) { \
MEGDNN_MARK_USED_VAR(src_dtype); \
float* workspace_ptr = workspace.raw_ptr->as<float>(); \
constexpr size_t T = 3; \
for (size_t a = 0; a < A; a++) { \
size_t remain_size = B; \
const float* read_ptr = src + a * B; \
float* write_ptr = workspace_ptr; \
while (remain_size > SIMD_WIDTH) { \
size_t read_idx = 0; \
size_t write_idx = 0; \
kern_4x15xT( \
read_ptr, read_idx, write_ptr, write_idx, remain_size, T); \
kern_4xXXx1( \
read_ptr, read_idx, write_ptr, write_idx, \
remain_size - read_idx); \
remain_size = write_idx; \
read_ptr = workspace_ptr; \
} \
cal_final_res(remain_size, read_ptr, write_ptr, dst, coef); \
dst++; \
} \
} \
};
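
End to end, the multi-pass loop in do_reduce behaves like this for B = 247 (an illustrative trace, not part of the commit): pass 1 reduces the 247 source floats to 12 workspace partials (180 -> 4, 60 -> 4, tail of 7 -> 4); pass 2 reduces those 12 partials to 4 (kern_4x15xT is skipped since 12 < 60); the loop exits once remain_size == SIMD_WIDTH, and cal_final_res sums the last 4 scalars and applies coef: 1 / B for Mean, 1 for Sum and SumSqr.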

#define GI_LOAD(SHIFT, PTR, RD_PTR, WR_PTR) \
d##SHIFT = GiLoadFloat32((PTR) + SIMD_WIDTH * SHIFT);
#define GI_LOAD_THEN_MULT(SHIFT, PTR, RD_PTR, WR_PTR) \
d##SHIFT = GiLoadFloat32((PTR) + SIMD_WIDTH * SHIFT); \
if (RD_PTR != WR_PTR) \
d##SHIFT = GiMultiplyFloat32(d##SHIFT, d##SHIFT);

#define CASE_GI_LOAD(NUM, PTR, SHIFT, RD_PTR, WR_PTR) \
case NUM: \
GI_LOAD(SHIFT, PTR, RD_PTR, WR_PTR) \
MEGDNN_FALLTHRU
#define CASE_GI_LOAD_THEN_MULT(NUM, PTR, SHIFT, RD_PTR, WR_PTR) \
case NUM: \
GI_LOAD_THEN_MULT(SHIFT, PTR, RD_PTR, WR_PTR) \
MEGDNN_FALLTHRU

#define FOR_MEAN_AND_SUM(rd_ptr, rd_idx, wr_ptr, wr_idx, tail_num) \
for (size_t i = 0; i < tail_num; i++) \
wr_ptr[wr_idx + i] = rd_ptr[rd_idx + i];
#define FOR_SUM_SQUARE(rd_ptr, rd_idx, wr_ptr, wr_idx, tail_num) \
if (rd_ptr != wr_ptr) \
for (size_t i = 0; i < tail_num; i++) \
wr_ptr[wr_idx + i] = rd_ptr[rd_idx + i] * rd_ptr[rd_idx + i]; \
else \
for (size_t i = 0; i < tail_num; i++) \
wr_ptr[wr_idx + i] = rd_ptr[rd_idx + i];

#define CAL_FINAL_RESULT(remain_size, read_ptr, write_ptr, dst_ptr, coef) \
float val = 0; \
if (write_ptr != read_ptr) \
for (size_t i = 0; i < remain_size; i++) \
val = val + read_ptr[i]; \
else \
for (size_t i = 0; i < remain_size; i++) \
val = val + write_ptr[i]; \
*dst_ptr = val * coef;
#define CAL_FINAL_SQUARE_RESULT(remain_size, read_ptr, write_ptr, dst_ptr, coef) \
float val = 0; \
if (write_ptr != read_ptr) \
for (size_t i = 0; i < remain_size; i++) \
val = val + read_ptr[i] * read_ptr[i]; \
else \
for (size_t i = 0; i < remain_size; i++) \
val = val + write_ptr[i]; \
*dst_ptr = val * coef;
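
The RD_PTR != WR_PTR and read_ptr != write_ptr tests above all encode the same first-pass distinction for SUM_SQR: on the first pass the data is read from src (read and write pointers differ), so every element must be squared as it is loaded; on later passes, and in the final reduction once at least one pass has run, the read pointer aliases the workspace, whose partials are already sums of squares and must only be added.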

ImplementC1LargeB(
Mean, 1 / B, CASE_GI_LOAD, GI_LOAD, FOR_MEAN_AND_SUM, CAL_FINAL_RESULT);
ImplementC1LargeB(Sum, 1, CASE_GI_LOAD, GI_LOAD, FOR_MEAN_AND_SUM, CAL_FINAL_RESULT);
ImplementC1LargeB(
SumSqr, 1, CASE_GI_LOAD_THEN_MULT, GI_LOAD_THEN_MULT, FOR_SUM_SQUARE,
CAL_FINAL_SQUARE_RESULT);

template <typename Reducer, typename dtype, size_t B>
struct ExecC1SmallB {
static void do_reduce(
const dtype* src, dtype* dst, DType src_dtype, size_t A, size_t, size_t C);
const dtype* src, dtype* dst, DType src_dtype, size_t A, size_t, size_t C,
_megdnn_workspace);
};

#define ImplementC1SmallB(_ctype, _gi_type, _gi_ins) \
@@ -459,7 +726,7 @@ struct ExecC1SmallB {
struct ExecC1SmallB<Reducer, _ctype, B> { \
static void do_reduce( \
const _ctype* src, _ctype* dst, DType src_dtype, size_t A, size_t, \
size_t) { \
size_t, _megdnn_workspace) { \
size_t a = 0; \
for (; a + Reducer::SIMD_WIDTH < A; a += Reducer::SIMD_WIDTH) { \
Reducer reducer(src_dtype, B); \


dnn/test/fallback/reduce.cpp  (+72, -0)

@@ -352,6 +352,78 @@ TEST_F(FALLBACK, BENCHMARK_REDUCE_VS_CONV) {
};
run();
}

TEST_F(FALLBACK, BENCHMARK_REDUCE) {
auto run = [&]() {
Benchmarker<Reduce> benchmarker_reduce(handle());
benchmarker_reduce.set_display(false);
using Mode = param::Reduce::Mode;

constexpr size_t RUNS = 100;
benchmarker_reduce.set_times(RUNS);

TensorShape small{3 * 224 * 224};
TensorShape large{3 * 224 * 224 * 100};
param::Reduce param;
param.axis = 0;

for (auto i = 224; i < 224 * 2; i++) {
for (auto mode : {Mode::SUM, Mode::MEAN, Mode::SUM_SQR}) {
param.mode = mode;
benchmarker_reduce.set_param(param);
auto reduce = benchmarker_reduce.execs({{3 * 224 * i}, {}}) / RUNS;
}
}
param.mode = param::Reduce::Mode::SUM;
benchmarker_reduce.set_param(param);
printf("SUM\n");
{
TensorLayout src(small, dtype::Float32());
auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;

printf("case 1: reduce use time %fms\n", reduce);
}
{
TensorLayout src(large, dtype::Float32());
auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;

printf("case 1: reduce use time %fms\n", reduce);
}

param.mode = param::Reduce::Mode::MEAN;
benchmarker_reduce.set_param(param);
printf("MEAN\n");
{
TensorLayout src(small, dtype::Float32());
auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;

printf("case 2: reduce use time %fms\n", reduce);
}
{
TensorLayout src(large, dtype::Float32());
auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;

printf("case 2: reduce use time %fms\n", reduce);
}

param.mode = param::Reduce::Mode::SUM_SQR;
benchmarker_reduce.set_param(param);
printf("SUM_SQR\n");
{
TensorLayout src(small, dtype::Float32());
auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;

printf("case 3: reduce use time %fms\n", reduce);
}
{
TensorLayout src(large, dtype::Float32());
auto reduce = benchmarker_reduce.execs({src, {}}) / RUNS;

printf("case 3: reduce use time %fms\n", reduce);
}
};
run();
}
#endif

// vim: syntax=cpp.doxygen
