From 2b99bfec4e9409908a1b830a667c077f90dcbe62 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Mon, 15 Aug 2022 16:54:32 +0800 Subject: [PATCH] feat(arm): supports weight pre-processing for winograd benchmark tests GitOrigin-RevId: 1797f3b91cd9413cb1f247d2078dee37b5ece8bf --- dnn/test/common/benchmarker.h | 35 +++++++++++---------- dnn/test/common/conv_bias.cpp | 46 +++++++++++++++++++++++++++ dnn/test/common/conv_bias.h | 3 ++ dnn/test/common/opr_proxy.h | 73 ++++++++++++++++++++++++++++++++++++++++--- dnn/test/common/topk.h | 1 + 5 files changed, 137 insertions(+), 21 deletions(-) diff --git a/dnn/test/common/benchmarker.h b/dnn/test/common/benchmarker.h index 6449d9e3..81f87ef5 100644 --- a/dnn/test/common/benchmarker.h +++ b/dnn/test/common/benchmarker.h @@ -14,7 +14,7 @@ namespace megdnn { namespace test { -template +template > class BenchmarkerBase { public: using Param = typename Opr::Param; @@ -28,7 +28,7 @@ public: m_handle(handle), m_default_rng(new NormalRNG()), m_param(Param()), - m_proxy{new OprProxy()} {} + m_proxy{new Proxy()} {} const Handle* handle() const { return m_handle; } @@ -81,12 +81,12 @@ public: } return layouts; } - BenchmarkerBase& set_proxy(std::unique_ptr>& proxy) { + BenchmarkerBase& set_proxy(std::unique_ptr& proxy) { m_proxy.reset(nullptr); m_proxy = std::move(proxy); return *this; } - std::unique_ptr>& proxy() { return m_proxy; } + std::unique_ptr& proxy() { return m_proxy; } BenchmarkerBase& set_times(size_t times) { m_times = times; return *this; @@ -135,14 +135,14 @@ private: std::map m_dtype; std::map m_fmt; Param m_param; - std::unique_ptr> m_proxy; + std::unique_ptr m_proxy; BeforeExecCallback m_before_exec_callback; std::unique_ptr m_opr; TensorsConstriant m_tensor_constraint; }; -template -float BenchmarkerBase::exec(TensorLayoutArray layouts) { +template +float BenchmarkerBase::exec(TensorLayoutArray layouts) { auto opr = this->opr(); opr->param() = m_param; auto user_layouts = layouts; @@ -196,6 +196,8 @@ float BenchmarkerBase::exec(TensorLayoutArray layouts) { if (m_before_exec_callback) { m_before_exec_callback(opr, tensors_cur); } + //! init weights + m_proxy->init(opr, tensors_cur); // run // warm up m_proxy->exec(opr, tensors_cur); @@ -246,8 +248,8 @@ float BenchmarkerBase::exec(TensorLayoutArray layouts) { return time_in_ms; } -template -float BenchmarkerBase::exect(const TensorValueArray& testcase_in) { +template +float BenchmarkerBase::exect(const TensorValueArray& testcase_in) { auto opr = this->opr(); opr->param() = m_param; TensorLayoutArray layouts; @@ -295,6 +297,8 @@ float BenchmarkerBase::exect(const TensorValueArray& testcase_in) { if (m_before_exec_callback) { m_before_exec_callback(opr, tensors_cur); } + //! init weights + m_proxy->init(opr, tensors_cur); //! run //! warm up m_proxy->exec(opr, tensors_cur); @@ -344,19 +348,16 @@ float BenchmarkerBase::exect(const TensorValueArray& testcase_in) { return time_in_ms; } -template -class Benchmarker; - -template -class Benchmarker : public BenchmarkerBase { +template > +class Benchmarker : public BenchmarkerBase { public: - Benchmarker(Handle* handle) : BenchmarkerBase{handle, Timer{}} {} + Benchmarker(Handle* handle) : BenchmarkerBase{handle, Timer{}} {} }; ////////////////// Algo Benchmark //////////////////////// template , typename T = Timer> float algo_benchmark( - Benchmarker& benchmark, TensorLayoutArray layouts, + Benchmarker& benchmark, TensorLayoutArray layouts, const std::string& algo_base) { Proxy proxy; auto opr = benchmark.opr(); @@ -381,7 +382,7 @@ float algo_benchmark( template , typename T = Timer> float algo_benchmark( - Benchmarker& benchmark, TensorShapeArray shapes, + Benchmarker& benchmark, TensorShapeArray shapes, const std::string& algo_base) { return algo_benchmark(benchmark, benchmark.make_layouts(shapes), algo_base); } diff --git a/dnn/test/common/conv_bias.cpp b/dnn/test/common/conv_bias.cpp index 5cb32484..09cbcaff 100644 --- a/dnn/test/common/conv_bias.cpp +++ b/dnn/test/common/conv_bias.cpp @@ -995,6 +995,52 @@ void benchmark_winograd( used / used_winograd); } } + +// usage of weight pre-processing for winograd benchmark +void benchmark_winograd_weight_preprocess( + const char* algo_name, megdnn::Handle* handle, size_t kernel, + size_t pack_size) { + auto&& args = get_winograd_benchmark_args(kernel, pack_size); + using namespace conv_bias; + constexpr size_t RUN = 10; + + //! here!!! + Benchmarker> + benchmark_winograd(handle); + benchmark_winograd.set_display(false); + benchmark_winograd.set_times(RUN); + + for (auto&& arg : args) { + TensorLayout dst_layout; + auto opr = handle->create_operator(); + opr->param() = arg.param; + opr->deduce_layout( + {arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()}, + {arg.bias, dtype::Float32()}, {}, dst_layout); + //! dst.nr_elems * IC * FH * FW * 2 + float computations = dst_layout.total_nr_elems() * arg.filter[1] * + arg.filter[2] * arg.filter[3] * 2.0 / + (1024 * 1024 * 1024) * 1e3; + + param::Convolution conv_param; + conv_param.pad_h = arg.param.pad_h; + conv_param.pad_w = arg.param.pad_w; + conv_param.stride_h = arg.param.stride_h; + conv_param.stride_w = arg.param.stride_w; + + benchmark_winograd.set_param(arg.param); + auto used_winograd = + algo_benchmark< + ConvBias, OprWeightPreprocessBenchmarkProxy, Timer>( + benchmark_winograd, {arg.src, arg.filter, {}, {}, {}}, + algo_name) / + RUN; + + printf("%s %s: %s: %f ms %f Gflops\n", arg.src.to_string().c_str(), + arg.filter.to_string().c_str(), algo_name, used_winograd, + computations / used_winograd); + } +} #endif // MEGDNN_WITH_BENCHMARK template diff --git a/dnn/test/common/conv_bias.h b/dnn/test/common/conv_bias.h index 4a3c5fc3..d389f0da 100644 --- a/dnn/test/common/conv_bias.h +++ b/dnn/test/common/conv_bias.h @@ -66,6 +66,9 @@ std::vector get_winograd_benchmark_args( void benchmark_winograd( const char* algo_name, megdnn::Handle* handle, size_t kernel, size_t pack_size = 1); +void benchmark_winograd_weight_preprocess( + const char* algo_name, megdnn::Handle* handle, size_t kernel, + size_t pack_size = 1); #endif // MEGDNN_WITH_BENCHMARK template void check_winograd( diff --git a/dnn/test/common/opr_proxy.h b/dnn/test/common/opr_proxy.h index baecd215..09202a12 100644 --- a/dnn/test/common/opr_proxy.h +++ b/dnn/test/common/opr_proxy.h @@ -114,7 +114,10 @@ template < bool has_workspace = OprTrait::has_workspace, bool can_deduce_layout = OprTrait::can_deduce_layout> struct OprProxyDefaultImpl : public DeduceLayoutProxy, - public ExecProxy {}; + public ExecProxy { + virtual void init(Opr*, const TensorNDArray&) {} + virtual ~OprProxyDefaultImpl() {} +}; template struct OprProxy : public OprProxyDefaultImpl {}; @@ -123,6 +126,9 @@ template struct OprWeightPreprocessProxy : public OprProxyDefaultImpl {}; template +struct OprWeightPreprocessBenchmarkProxy : OprProxyDefaultImpl {}; + +template struct OprProxyVectorToSingle {}; template <> @@ -134,6 +140,8 @@ struct OprProxy { opr->deduce_layout(inp, layouts.back()); } + static void init(ElemwiseForward*, const TensorNDArray&) {} + static void exec(ElemwiseForward* opr, const TensorNDArray& tensors) { megdnn_assert(tensors.size() >= 2); auto inp = tensors; @@ -151,6 +159,8 @@ struct OprProxy { opr->deduce_layout(inp, layouts.back()); } + static void init(ElemwiseMultiType*, const TensorNDArray&) {} + static void exec(ElemwiseMultiType* opr, const TensorNDArray& tensors) { megdnn_assert(tensors.size() >= 2); auto inp = tensors; @@ -169,6 +179,8 @@ struct OprProxy { opr->deduce_layout(inp, layouts.back()); } + static void init(ConcatForward*, const TensorNDArray&) {} + void exec(ConcatForward* opr, const TensorNDArray& tensors) { if (!W.valid()) { W = WorkspaceWrapper(opr->handle(), 0); @@ -200,6 +212,8 @@ struct OprProxy { opr->deduce_layout(inp, layouts.back()); } + static void init(CheckNonFinite*, const TensorNDArray&) {} + static void exec(CheckNonFinite* opr, const TensorNDArray& tensors) { megdnn_assert(tensors.size() >= 2); auto inps = tensors; @@ -220,6 +234,9 @@ struct OprProxy { template <> struct OprProxy : DeduceLayoutProxy { WorkspaceWrapper W; + + void init(SplitForward*, const TensorNDArray&) {} + void exec(SplitForward* opr, const TensorNDArray& tensors) { megdnn_assert(tensors.size() >= 2); if (!W.valid()) { @@ -428,7 +445,9 @@ struct OprProxyProfilingBase best_algo); } - void exec(Opr* opr, const TensorNDArray& tensors) { + virtual void init(Opr*, const TensorNDArray&) {} + + virtual void exec(Opr* opr, const TensorNDArray& tensors) { megdnn_assert(tensors.size() == arity); if (!W.valid()) { W = WorkspaceWrapper(opr->handle(), 0); @@ -463,6 +482,8 @@ struct OprProxyProfilingBase } AlgoProxy::exec(opr, tensors, W.workspace()); } + + virtual ~OprProxyProfilingBase() {} }; #define DEF_PROF(c) \ @@ -491,7 +512,7 @@ template struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase { using Base = OprProxyProfilingBase; static constexpr int arity = OprTrait::arity; - void exec(Opr* opr, const TensorNDArray& tensors) { + void exec(Opr* opr, const TensorNDArray& tensors) override { megdnn_assert(tensors.size() == arity); if (!Base::W.valid()) { Base::W = WorkspaceWrapper(opr->handle(), 0); @@ -584,11 +605,55 @@ struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase { } }; +template +struct OprWeightPreprocessProxyBenchmarkImpl + : public OprWeightPreprocessProxyImpl { + using Base = OprProxyProfilingBase; + static constexpr int arity = OprTrait::arity; + void init(Opr* opr, const TensorNDArray& tensors) override { + megdnn_assert(tensors.size() == arity); + if (!Base::W.valid()) { + Base::W = WorkspaceWrapper(opr->handle(), 0); + } + TensorLayoutArray layouts; + for (auto&& tensor : tensors) { + layouts.push_back(tensor.layout); + } + m_preprocessed_tensors = this->weight_prerocess( + opr, tensors, Base::target_execution_policy.algo); + megcoreSynchronize(opr->handle()->megcore_computing_handle()); + typename Opr::PreprocessedFilter preprocessed_filter{ + nullptr, *m_preprocessed_tensors}; + if (!Base::target_execution_policy.algo.valid()) { + auto workspace_size = AlgoProxy::get_workspace_in_bytes( + opr, layouts, &preprocessed_filter); + Base::W.update(workspace_size); + } + } + + void exec(Opr* opr, const TensorNDArray& tensors) override { + megdnn_assert(tensors.size() == arity); + typename Opr::PreprocessedFilter preprocessed_filter{ + nullptr, *m_preprocessed_tensors}; + AlgoProxy::exec( + opr, tensors, &preprocessed_filter, Base::W.workspace()); + } + +public: + std::shared_ptr m_preprocessed_tensors; +}; + #define DEF_PROF(c) \ template <> \ struct OprWeightPreprocessProxy : public OprWeightPreprocessProxyImpl { \ using OprWeightPreprocessProxyImpl::OprWeightPreprocessProxyImpl; \ - } + }; \ + template <> \ + struct OprWeightPreprocessBenchmarkProxy \ + : public OprWeightPreprocessProxyBenchmarkImpl { \ + using OprWeightPreprocessProxyBenchmarkImpl< \ + c>::OprWeightPreprocessProxyBenchmarkImpl; \ + }; DEF_PROF(ConvolutionForward); DEF_PROF(ConvBias); diff --git a/dnn/test/common/topk.h b/dnn/test/common/topk.h index 1163433c..0bd5baeb 100644 --- a/dnn/test/common/topk.h +++ b/dnn/test/common/topk.h @@ -16,6 +16,7 @@ private: public: OprProxy() = default; OprProxy(int k) : m_k{k} {} + void init(TopK*, const TensorLayoutArray&) {} void deduce_layout(TopK* opr, TensorLayoutArray& layouts) { if (layouts.size() == 3) {