
feat(arm): supports weight pre-processing for winograd benchmark tests

GitOrigin-RevId: 1797f3b91c
Branch: dev-support-lite-fork-debug-mode
Megvii Engine Team · 2 years ago
Commit: 2b99bfec4e
5 changed files with 137 additions and 21 deletions:

  1. dnn/test/common/benchmarker.h   (+18 / -17)
  2. dnn/test/common/conv_bias.cpp   (+46 / -0)
  3. dnn/test/common/conv_bias.h     (+3 / -0)
  4. dnn/test/common/opr_proxy.h     (+69 / -4)
  5. dnn/test/common/topk.h          (+1 / -0)
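
Note: the new benchmark_winograd_weight_preprocess entry point mirrors the
existing benchmark_winograd helper, but drives execution through
OprWeightPreprocessBenchmarkProxy, so the winograd filter transform is paid
once up front instead of on every timed run. A hypothetical call site from an
arm benchmark test (the TEST_F fixture name and the algo-name string are
illustrative assumptions, not part of this commit):

    #if MEGDNN_WITH_BENCHMARK
    TEST_F(ARM_COMMON, BENCHMARK_CONVBIAS_WINOGRAD_F32_WEIGHT_PREPROCESS) {
        //! kernel = 3, pack_size = 4; the algo-name string is illustrative
        benchmark_winograd_weight_preprocess(
                "WINOGRAD:AARCH64_F32_MK4_4x16:4:2", handle(), 3, 4);
    }
    #endif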

dnn/test/common/benchmarker.h   (+18 / -17)

@@ -14,7 +14,7 @@
 namespace megdnn {
 namespace test {
 
-template <typename Opr, typename T>
+template <typename Opr, typename T, typename Proxy = OprProxy<Opr>>
 class BenchmarkerBase {
 public:
     using Param = typename Opr::Param;
@@ -28,7 +28,7 @@ public:
               m_handle(handle),
               m_default_rng(new NormalRNG()),
               m_param(Param()),
-              m_proxy{new OprProxy<Opr>()} {}
+              m_proxy{new Proxy()} {}
 
     const Handle* handle() const { return m_handle; }
 
@@ -81,12 +81,12 @@ public:
         }
         return layouts;
     }
-    BenchmarkerBase& set_proxy(std::unique_ptr<OprProxy<Opr>>& proxy) {
+    BenchmarkerBase& set_proxy(std::unique_ptr<Proxy>& proxy) {
         m_proxy.reset(nullptr);
         m_proxy = std::move(proxy);
         return *this;
     }
-    std::unique_ptr<OprProxy<Opr>>& proxy() { return m_proxy; }
+    std::unique_ptr<Proxy>& proxy() { return m_proxy; }
     BenchmarkerBase& set_times(size_t times) {
         m_times = times;
         return *this;
@@ -135,14 +135,14 @@ private:
     std::map<size_t, DType> m_dtype;
     std::map<size_t, TensorFormat> m_fmt;
     Param m_param;
-    std::unique_ptr<OprProxy<Opr>> m_proxy;
+    std::unique_ptr<Proxy> m_proxy;
     BeforeExecCallback m_before_exec_callback;
     std::unique_ptr<Opr> m_opr;
     TensorsConstriant m_tensor_constraint;
 };
 
-template <typename Opr, typename T>
-float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) {
+template <typename Opr, typename T, typename OprProxy>
+float BenchmarkerBase<Opr, T, OprProxy>::exec(TensorLayoutArray layouts) {
     auto opr = this->opr();
     opr->param() = m_param;
     auto user_layouts = layouts;
@@ -196,6 +196,8 @@ float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) {
     if (m_before_exec_callback) {
         m_before_exec_callback(opr, tensors_cur);
     }
+    //! init weights
+    m_proxy->init(opr, tensors_cur);
     // run
     // warm up
     m_proxy->exec(opr, tensors_cur);
@@ -246,8 +248,8 @@ float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) {
     return time_in_ms;
 }
 
-template <typename Opr, typename T>
-float BenchmarkerBase<Opr, T>::exect(const TensorValueArray& testcase_in) {
+template <typename Opr, typename T, typename Proxy>
+float BenchmarkerBase<Opr, T, Proxy>::exect(const TensorValueArray& testcase_in) {
     auto opr = this->opr();
     opr->param() = m_param;
     TensorLayoutArray layouts;
@@ -295,6 +297,8 @@ float BenchmarkerBase<Opr, T>::exect(const TensorValueArray& testcase_in) {
     if (m_before_exec_callback) {
         m_before_exec_callback(opr, tensors_cur);
     }
+    //! init weights
+    m_proxy->init(opr, tensors_cur);
     //! run
     //! warm up
     m_proxy->exec(opr, tensors_cur);
@@ -344,19 +348,16 @@ float BenchmarkerBase<Opr, T>::exect(const TensorValueArray& testcase_in) {
     return time_in_ms;
 }
 
-template <typename Opr, typename T = Timer>
-class Benchmarker;
-
-template <typename Opr>
-class Benchmarker<Opr, Timer> : public BenchmarkerBase<Opr, Timer> {
+template <typename Opr, typename T = Timer, typename Proxy = OprProxy<Opr>>
+class Benchmarker : public BenchmarkerBase<Opr, T, Proxy> {
 public:
-    Benchmarker(Handle* handle) : BenchmarkerBase<Opr, Timer>{handle, Timer{}} {}
+    Benchmarker(Handle* handle) : BenchmarkerBase<Opr, T, Proxy>{handle, Timer{}} {}
 };
 
 ////////////////// Algo Benchmark ////////////////////////
 template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer>
 float algo_benchmark(
-        Benchmarker<Opr, T>& benchmark, TensorLayoutArray layouts,
+        Benchmarker<Opr, T, Proxy>& benchmark, TensorLayoutArray layouts,
         const std::string& algo_base) {
     Proxy proxy;
     auto opr = benchmark.opr();
@@ -381,7 +382,7 @@ float algo_benchmark(
 
 template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer>
 float algo_benchmark(
-        Benchmarker<Opr, T>& benchmark, TensorShapeArray shapes,
+        Benchmarker<Opr, T, Proxy>& benchmark, TensorShapeArray shapes,
         const std::string& algo_base) {
     return algo_benchmark(benchmark, benchmark.make_layouts(shapes), algo_base);
 }
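
With the new third template parameter, any proxy type that exposes init() and
exec() can be slotted into the benchmark loop: init() runs once after the
tensors are prepared and before the warm-up exec(), so one-time work is
excluded from the timed iterations. A minimal conforming proxy (the name
CustomInitProxy and its body are illustrative, not from this commit):

    template <typename Opr>
    struct CustomInitProxy : OprProxy<Opr> {
        //! runs once per exec()/exect() call, outside the timed region
        void init(Opr*, const TensorNDArray&) { /* one-time setup */ }
        //! exec() is inherited and runs for the warm-up and each timed pass
    };

    //! usage: Benchmarker<ConvBias, Timer, CustomInitProxy<ConvBias>> b(handle);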


dnn/test/common/conv_bias.cpp   (+46 / -0)

@@ -995,6 +995,52 @@ void benchmark_winograd(
                used / used_winograd);
     }
 }
+
+// usage of weight pre-processing for winograd benchmark
+void benchmark_winograd_weight_preprocess(
+        const char* algo_name, megdnn::Handle* handle, size_t kernel,
+        size_t pack_size) {
+    auto&& args = get_winograd_benchmark_args(kernel, pack_size);
+    using namespace conv_bias;
+    constexpr size_t RUN = 10;
+
+    //! use the weight-preprocess proxy: the filter transform runs once in init()
+    Benchmarker<ConvBias, Timer, OprWeightPreprocessBenchmarkProxy<ConvBias>>
+            benchmark_winograd(handle);
+    benchmark_winograd.set_display(false);
+    benchmark_winograd.set_times(RUN);
+
+    for (auto&& arg : args) {
+        TensorLayout dst_layout;
+        auto opr = handle->create_operator<ConvBias>();
+        opr->param() = arg.param;
+        opr->deduce_layout(
+                {arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()},
+                {arg.bias, dtype::Float32()}, {}, dst_layout);
+        //! dst.nr_elems * IC * FH * FW * 2
+        float computations = dst_layout.total_nr_elems() * arg.filter[1] *
+                             arg.filter[2] * arg.filter[3] * 2.0 /
+                             (1024 * 1024 * 1024) * 1e3;
+
+        param::Convolution conv_param;
+        conv_param.pad_h = arg.param.pad_h;
+        conv_param.pad_w = arg.param.pad_w;
+        conv_param.stride_h = arg.param.stride_h;
+        conv_param.stride_w = arg.param.stride_w;
+
+        benchmark_winograd.set_param(arg.param);
+        auto used_winograd =
+                algo_benchmark<
+                        ConvBias, OprWeightPreprocessBenchmarkProxy<ConvBias>, Timer>(
+                        benchmark_winograd, {arg.src, arg.filter, {}, {}, {}},
+                        algo_name) /
+                RUN;
+
+        printf("%s %s: %s: %f ms %f Gflops\n", arg.src.to_string().c_str(),
+               arg.filter.to_string().c_str(), algo_name, used_winograd,
+               computations / used_winograd);
+    }
+}
 #endif  // MEGDNN_WITH_BENCHMARK
 
 template <class Checker>
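
The computations value above counts two floating-point ops (one multiply, one
add) per filter tap per output element, divides by 2^30 to get Gflop, and
pre-multiplies by 1e3 so that dividing by a time in milliseconds yields Gflops
directly. A standalone check of the arithmetic (the 1x64x56x56 output and
64x64x3x3 filter are illustrative shapes, not from this commit):

    #include <cstdio>

    int main() {
        double dst_elems = 1.0 * 64 * 56 * 56;      // N*OC*OH*OW, illustrative
        double flops = dst_elems * 64 * 3 * 3 * 2;  // IC=64, FH=FW=3, 2 ops/tap
        double computations = flops / (1024.0 * 1024.0 * 1024.0) * 1e3;
        double used_ms = 1.0;  // hypothetical measured time per run
        std::printf("%.1f Gflops\n", computations / used_ms);  // ~215.3
        return 0;
    }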


dnn/test/common/conv_bias.h   (+3 / -0)

@@ -66,6 +66,9 @@ std::vector<conv_bias::TestArg> get_winograd_benchmark_args(
 void benchmark_winograd(
         const char* algo_name, megdnn::Handle* handle, size_t kernel,
         size_t pack_size = 1);
+void benchmark_winograd_weight_preprocess(
+        const char* algo_name, megdnn::Handle* handle, size_t kernel,
+        size_t pack_size = 1);
 #endif  // MEGDNN_WITH_BENCHMARK
 template <class Checker>
 void check_winograd(


dnn/test/common/opr_proxy.h   (+69 / -4)

@@ -114,7 +114,10 @@ template <
         bool has_workspace = OprTrait<Opr>::has_workspace,
         bool can_deduce_layout = OprTrait<Opr>::can_deduce_layout>
 struct OprProxyDefaultImpl : public DeduceLayoutProxy<Opr, arity, can_deduce_layout>,
-                             public ExecProxy<Opr, arity, has_workspace> {};
+                             public ExecProxy<Opr, arity, has_workspace> {
+    virtual void init(Opr*, const TensorNDArray&) {}
+    virtual ~OprProxyDefaultImpl() {}
+};
 
 template <typename Opr>
 struct OprProxy : public OprProxyDefaultImpl<Opr> {};
@@ -123,6 +126,9 @@ template <typename Opr>
 struct OprWeightPreprocessProxy : public OprProxyDefaultImpl<Opr> {};
 
+template <typename Opr>
+struct OprWeightPreprocessBenchmarkProxy : OprProxyDefaultImpl<Opr> {};
+
 template <typename Opr>
 struct OprProxyVectorToSingle {};
 
 template <>
@@ -134,6 +140,8 @@ struct OprProxy<ElemwiseForward> {
         opr->deduce_layout(inp, layouts.back());
     }
 
+    static void init(ElemwiseForward*, const TensorNDArray&) {}
+
     static void exec(ElemwiseForward* opr, const TensorNDArray& tensors) {
         megdnn_assert(tensors.size() >= 2);
         auto inp = tensors;
@@ -151,6 +159,8 @@ struct OprProxy<ElemwiseMultiType> {
         opr->deduce_layout(inp, layouts.back());
     }
 
+    static void init(ElemwiseMultiType*, const TensorNDArray&) {}
+
     static void exec(ElemwiseMultiType* opr, const TensorNDArray& tensors) {
         megdnn_assert(tensors.size() >= 2);
         auto inp = tensors;
@@ -169,6 +179,8 @@ struct OprProxy<ConcatForward> {
         opr->deduce_layout(inp, layouts.back());
     }
 
+    static void init(ConcatForward*, const TensorNDArray&) {}
+
     void exec(ConcatForward* opr, const TensorNDArray& tensors) {
         if (!W.valid()) {
             W = WorkspaceWrapper(opr->handle(), 0);
@@ -200,6 +212,8 @@ struct OprProxy<CheckNonFinite> {
         opr->deduce_layout(inp, layouts.back());
     }
 
+    static void init(CheckNonFinite*, const TensorNDArray&) {}
+
     static void exec(CheckNonFinite* opr, const TensorNDArray& tensors) {
         megdnn_assert(tensors.size() >= 2);
         auto inps = tensors;
@@ -220,6 +234,9 @@ struct OprProxy<CheckNonFinite> {
 template <>
 struct OprProxy<SplitForward> : DeduceLayoutProxy<SplitForward, 0, false> {
     WorkspaceWrapper W;
+
+    void init(SplitForward*, const TensorNDArray&) {}
+
     void exec(SplitForward* opr, const TensorNDArray& tensors) {
         megdnn_assert(tensors.size() >= 2);
         if (!W.valid()) {
@@ -428,7 +445,9 @@ struct OprProxyProfilingBase
                 best_algo);
     }
 
-    void exec(Opr* opr, const TensorNDArray& tensors) {
+    virtual void init(Opr*, const TensorNDArray&) {}
+
+    virtual void exec(Opr* opr, const TensorNDArray& tensors) {
         megdnn_assert(tensors.size() == arity);
         if (!W.valid()) {
             W = WorkspaceWrapper(opr->handle(), 0);
@@ -463,6 +482,8 @@ struct OprProxyProfilingBase
         }
         AlgoProxy<Opr, arity>::exec(opr, tensors, W.workspace());
     }
+
+    virtual ~OprProxyProfilingBase() {}
 };
 
 #define DEF_PROF(c)                                                          \
@@ -491,7 +512,7 @@ template <class Opr>
 struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr> {
     using Base = OprProxyProfilingBase<Opr>;
     static constexpr int arity = OprTrait<Opr>::arity;
-    void exec(Opr* opr, const TensorNDArray& tensors) {
+    void exec(Opr* opr, const TensorNDArray& tensors) override {
         megdnn_assert(tensors.size() == arity);
         if (!Base::W.valid()) {
             Base::W = WorkspaceWrapper(opr->handle(), 0);
@@ -584,11 +605,55 @@ struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr> {
     }
 };
 
+template <class Opr>
+struct OprWeightPreprocessProxyBenchmarkImpl
+        : public OprWeightPreprocessProxyImpl<Opr> {
+    using Base = OprProxyProfilingBase<Opr>;
+    static constexpr int arity = OprTrait<Opr>::arity;
+    void init(Opr* opr, const TensorNDArray& tensors) override {
+        megdnn_assert(tensors.size() == arity);
+        if (!Base::W.valid()) {
+            Base::W = WorkspaceWrapper(opr->handle(), 0);
+        }
+        TensorLayoutArray layouts;
+        for (auto&& tensor : tensors) {
+            layouts.push_back(tensor.layout);
+        }
+        m_preprocessed_tensors = this->weight_prerocess(
+                opr, tensors, Base::target_execution_policy.algo);
+        megcoreSynchronize(opr->handle()->megcore_computing_handle());
+        typename Opr::PreprocessedFilter preprocessed_filter{
+                nullptr, *m_preprocessed_tensors};
+        if (!Base::target_execution_policy.algo.valid()) {
+            auto workspace_size = AlgoProxy<Opr, arity>::get_workspace_in_bytes(
+                    opr, layouts, &preprocessed_filter);
+            Base::W.update(workspace_size);
+        }
+    }
+
+    void exec(Opr* opr, const TensorNDArray& tensors) override {
+        megdnn_assert(tensors.size() == arity);
+        typename Opr::PreprocessedFilter preprocessed_filter{
+                nullptr, *m_preprocessed_tensors};
+        AlgoProxy<Opr, arity>::exec(
+                opr, tensors, &preprocessed_filter, Base::W.workspace());
+    }
+
+public:
+    std::shared_ptr<TensorNDArray> m_preprocessed_tensors;
+};
+
 #define DEF_PROF(c)                                                           \
     template <>                                                               \
     struct OprWeightPreprocessProxy<c> : public OprWeightPreprocessProxyImpl<c> { \
         using OprWeightPreprocessProxyImpl<c>::OprWeightPreprocessProxyImpl;  \
-    }
+    };                                                                        \
+    template <>                                                               \
+    struct OprWeightPreprocessBenchmarkProxy<c>                               \
+            : public OprWeightPreprocessProxyBenchmarkImpl<c> {               \
+        using OprWeightPreprocessProxyBenchmarkImpl<                          \
+                c>::OprWeightPreprocessProxyBenchmarkImpl;                    \
+    };
 
 DEF_PROF(ConvolutionForward);
 DEF_PROF(ConvBias);

dnn/test/common/topk.h   (+1 / -0)

@@ -16,6 +16,7 @@ private:
 public:
     OprProxy() = default;
     OprProxy(int k) : m_k{k} {}
+    void init(TopK*, const TensorLayoutArray&) {}
 
     void deduce_layout(TopK* opr, TensorLayoutArray& layouts) {
         if (layouts.size() == 3) {

