GitOrigin-RevId: 1797f3b91c
dev-support-lite-fork-debug-mode
@@ -14,7 +14,7 @@ | |||||
namespace megdnn { | namespace megdnn { | ||||
namespace test { | namespace test { | ||||
template <typename Opr, typename T> | |||||
template <typename Opr, typename T, typename Proxy = OprProxy<Opr>> | |||||
class BenchmarkerBase { | class BenchmarkerBase { | ||||
public: | public: | ||||
using Param = typename Opr::Param; | using Param = typename Opr::Param; | ||||
@@ -28,7 +28,7 @@ public: | |||||
m_handle(handle), | m_handle(handle), | ||||
m_default_rng(new NormalRNG()), | m_default_rng(new NormalRNG()), | ||||
m_param(Param()), | m_param(Param()), | ||||
m_proxy{new OprProxy<Opr>()} {} | |||||
m_proxy{new Proxy()} {} | |||||
const Handle* handle() const { return m_handle; } | const Handle* handle() const { return m_handle; } | ||||
@@ -81,12 +81,12 @@ public: | |||||
} | } | ||||
return layouts; | return layouts; | ||||
} | } | ||||
BenchmarkerBase& set_proxy(std::unique_ptr<OprProxy<Opr>>& proxy) { | |||||
BenchmarkerBase& set_proxy(std::unique_ptr<Proxy>& proxy) { | |||||
m_proxy.reset(nullptr); | m_proxy.reset(nullptr); | ||||
m_proxy = std::move(proxy); | m_proxy = std::move(proxy); | ||||
return *this; | return *this; | ||||
} | } | ||||
std::unique_ptr<OprProxy<Opr>>& proxy() { return m_proxy; } | |||||
std::unique_ptr<Proxy>& proxy() { return m_proxy; } | |||||
BenchmarkerBase& set_times(size_t times) { | BenchmarkerBase& set_times(size_t times) { | ||||
m_times = times; | m_times = times; | ||||
return *this; | return *this; | ||||
@@ -135,14 +135,14 @@ private: | |||||
std::map<size_t, DType> m_dtype; | std::map<size_t, DType> m_dtype; | ||||
std::map<size_t, TensorFormat> m_fmt; | std::map<size_t, TensorFormat> m_fmt; | ||||
Param m_param; | Param m_param; | ||||
std::unique_ptr<OprProxy<Opr>> m_proxy; | |||||
std::unique_ptr<Proxy> m_proxy; | |||||
BeforeExecCallback m_before_exec_callback; | BeforeExecCallback m_before_exec_callback; | ||||
std::unique_ptr<Opr> m_opr; | std::unique_ptr<Opr> m_opr; | ||||
TensorsConstriant m_tensor_constraint; | TensorsConstriant m_tensor_constraint; | ||||
}; | }; | ||||
template <typename Opr, typename T> | |||||
float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) { | |||||
template <typename Opr, typename T, typename OprProxy> | |||||
float BenchmarkerBase<Opr, T, OprProxy>::exec(TensorLayoutArray layouts) { | |||||
auto opr = this->opr(); | auto opr = this->opr(); | ||||
opr->param() = m_param; | opr->param() = m_param; | ||||
auto user_layouts = layouts; | auto user_layouts = layouts; | ||||
@@ -196,6 +196,8 @@ float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) { | |||||
if (m_before_exec_callback) { | if (m_before_exec_callback) { | ||||
m_before_exec_callback(opr, tensors_cur); | m_before_exec_callback(opr, tensors_cur); | ||||
} | } | ||||
//! init weights | |||||
m_proxy->init(opr, tensors_cur); | |||||
// run | // run | ||||
// warm up | // warm up | ||||
m_proxy->exec(opr, tensors_cur); | m_proxy->exec(opr, tensors_cur); | ||||
@@ -246,8 +248,8 @@ float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) { | |||||
return time_in_ms; | return time_in_ms; | ||||
} | } | ||||
template <typename Opr, typename T> | |||||
float BenchmarkerBase<Opr, T>::exect(const TensorValueArray& testcase_in) { | |||||
template <typename Opr, typename T, typename Proxy> | |||||
float BenchmarkerBase<Opr, T, Proxy>::exect(const TensorValueArray& testcase_in) { | |||||
auto opr = this->opr(); | auto opr = this->opr(); | ||||
opr->param() = m_param; | opr->param() = m_param; | ||||
TensorLayoutArray layouts; | TensorLayoutArray layouts; | ||||
@@ -295,6 +297,8 @@ float BenchmarkerBase<Opr, T>::exect(const TensorValueArray& testcase_in) { | |||||
if (m_before_exec_callback) { | if (m_before_exec_callback) { | ||||
m_before_exec_callback(opr, tensors_cur); | m_before_exec_callback(opr, tensors_cur); | ||||
} | } | ||||
//! init weights | |||||
m_proxy->init(opr, tensors_cur); | |||||
//! run | //! run | ||||
//! warm up | //! warm up | ||||
m_proxy->exec(opr, tensors_cur); | m_proxy->exec(opr, tensors_cur); | ||||
@@ -344,19 +348,16 @@ float BenchmarkerBase<Opr, T>::exect(const TensorValueArray& testcase_in) { | |||||
return time_in_ms; | return time_in_ms; | ||||
} | } | ||||
template <typename Opr, typename T = Timer> | |||||
class Benchmarker; | |||||
template <typename Opr> | |||||
class Benchmarker<Opr, Timer> : public BenchmarkerBase<Opr, Timer> { | |||||
template <typename Opr, typename T = Timer, typename Proxy = OprProxy<Opr>> | |||||
class Benchmarker : public BenchmarkerBase<Opr, T, Proxy> { | |||||
public: | public: | ||||
Benchmarker(Handle* handle) : BenchmarkerBase<Opr, Timer>{handle, Timer{}} {} | |||||
//! Forward \p handle to the base class together with a default-constructed
//! timer of the selected timer type T. For the default T = Timer this is the
//! same Timer{} that was passed before; using T{} keeps the argument in step
//! with the timer type the base class is instantiated with.
Benchmarker(Handle* handle) : BenchmarkerBase<Opr, T, Proxy>{handle, T{}} {}
}; | }; | ||||
////////////////// Algo Benchmark //////////////////////// | ////////////////// Algo Benchmark //////////////////////// | ||||
template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer> | template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer> | ||||
float algo_benchmark( | float algo_benchmark( | ||||
Benchmarker<Opr, T>& benchmark, TensorLayoutArray layouts, | |||||
Benchmarker<Opr, T, Proxy>& benchmark, TensorLayoutArray layouts, | |||||
const std::string& algo_base) { | const std::string& algo_base) { | ||||
Proxy proxy; | Proxy proxy; | ||||
auto opr = benchmark.opr(); | auto opr = benchmark.opr(); | ||||
@@ -381,7 +382,7 @@ float algo_benchmark( | |||||
template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer> | template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer> | ||||
float algo_benchmark( | float algo_benchmark( | ||||
Benchmarker<Opr, T>& benchmark, TensorShapeArray shapes, | |||||
Benchmarker<Opr, T, Proxy>& benchmark, TensorShapeArray shapes, | |||||
const std::string& algo_base) { | const std::string& algo_base) { | ||||
return algo_benchmark(benchmark, benchmark.make_layouts(shapes), algo_base); | return algo_benchmark(benchmark, benchmark.make_layouts(shapes), algo_base); | ||||
} | } | ||||
@@ -995,6 +995,52 @@ void benchmark_winograd( | |||||
used / used_winograd); | used / used_winograd); | ||||
} | } | ||||
} | } | ||||
// usage of weight pre-processing for winograd benchmark | |||||
void benchmark_winograd_weight_preprocess( | |||||
const char* algo_name, megdnn::Handle* handle, size_t kernel, | |||||
size_t pack_size) { | |||||
auto&& args = get_winograd_benchmark_args(kernel, pack_size); | |||||
using namespace conv_bias; | |||||
constexpr size_t RUN = 10; | |||||
//! here!!! | |||||
Benchmarker<ConvBias, Timer, OprWeightPreprocessBenchmarkProxy<ConvBias>> | |||||
benchmark_winograd(handle); | |||||
benchmark_winograd.set_display(false); | |||||
benchmark_winograd.set_times(RUN); | |||||
for (auto&& arg : args) { | |||||
TensorLayout dst_layout; | |||||
auto opr = handle->create_operator<ConvBias>(); | |||||
opr->param() = arg.param; | |||||
opr->deduce_layout( | |||||
{arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()}, | |||||
{arg.bias, dtype::Float32()}, {}, dst_layout); | |||||
//! dst.nr_elems * IC * FH * FW * 2 | |||||
float computations = dst_layout.total_nr_elems() * arg.filter[1] * | |||||
arg.filter[2] * arg.filter[3] * 2.0 / | |||||
(1024 * 1024 * 1024) * 1e3; | |||||
param::Convolution conv_param; | |||||
conv_param.pad_h = arg.param.pad_h; | |||||
conv_param.pad_w = arg.param.pad_w; | |||||
conv_param.stride_h = arg.param.stride_h; | |||||
conv_param.stride_w = arg.param.stride_w; | |||||
benchmark_winograd.set_param(arg.param); | |||||
auto used_winograd = | |||||
algo_benchmark< | |||||
ConvBias, OprWeightPreprocessBenchmarkProxy<ConvBias>, Timer>( | |||||
benchmark_winograd, {arg.src, arg.filter, {}, {}, {}}, | |||||
algo_name) / | |||||
RUN; | |||||
printf("%s %s: %s: %f ms %f Gflops\n", arg.src.to_string().c_str(), | |||||
arg.filter.to_string().c_str(), algo_name, used_winograd, | |||||
computations / used_winograd); | |||||
} | |||||
} | |||||
#endif // MEGDNN_WITH_BENCHMARK | #endif // MEGDNN_WITH_BENCHMARK | ||||
template <class Checker> | template <class Checker> | ||||
@@ -66,6 +66,9 @@ std::vector<conv_bias::TestArg> get_winograd_benchmark_args( | |||||
void benchmark_winograd( | void benchmark_winograd( | ||||
const char* algo_name, megdnn::Handle* handle, size_t kernel, | const char* algo_name, megdnn::Handle* handle, size_t kernel, | ||||
size_t pack_size = 1); | size_t pack_size = 1); | ||||
//! Winograd benchmark variant that runs ConvBias through
//! OprWeightPreprocessBenchmarkProxy; same parameters as benchmark_winograd(),
//! pack_size defaults to 1.
void benchmark_winograd_weight_preprocess( | |||||
const char* algo_name, megdnn::Handle* handle, size_t kernel, | |||||
size_t pack_size = 1); | |||||
#endif // MEGDNN_WITH_BENCHMARK | #endif // MEGDNN_WITH_BENCHMARK | ||||
template <class Checker> | template <class Checker> | ||||
void check_winograd( | void check_winograd( | ||||
@@ -114,7 +114,10 @@ template < | |||||
bool has_workspace = OprTrait<Opr>::has_workspace, | bool has_workspace = OprTrait<Opr>::has_workspace, | ||||
bool can_deduce_layout = OprTrait<Opr>::can_deduce_layout> | bool can_deduce_layout = OprTrait<Opr>::can_deduce_layout> | ||||
struct OprProxyDefaultImpl : public DeduceLayoutProxy<Opr, arity, can_deduce_layout>, | struct OprProxyDefaultImpl : public DeduceLayoutProxy<Opr, arity, can_deduce_layout>, | ||||
public ExecProxy<Opr, arity, has_workspace> {}; | |||||
public ExecProxy<Opr, arity, has_workspace> { | |||||
//! Empty by default; virtual so that a derived proxy can override it.
//! BenchmarkerBase calls init() on its proxy right before the warm-up exec().
virtual void init(Opr*, const TensorNDArray&) {} | |||||
//! Virtual destructor, needed now that the proxy has virtual members and may
//! be destroyed through a base-class pointer.
virtual ~OprProxyDefaultImpl() {} | |||||
}; | |||||
template <typename Opr> | template <typename Opr> | ||||
struct OprProxy : public OprProxyDefaultImpl<Opr> {}; | struct OprProxy : public OprProxyDefaultImpl<Opr> {}; | ||||
@@ -123,6 +126,9 @@ template <typename Opr> | |||||
struct OprWeightPreprocessProxy : public OprProxyDefaultImpl<Opr> {}; | struct OprWeightPreprocessProxy : public OprProxyDefaultImpl<Opr> {}; | ||||
template <typename Opr> | template <typename Opr> | ||||
struct OprWeightPreprocessBenchmarkProxy : OprProxyDefaultImpl<Opr> {}; | |||||
template <typename Opr> | |||||
struct OprProxyVectorToSingle {}; | struct OprProxyVectorToSingle {}; | ||||
template <> | template <> | ||||
@@ -134,6 +140,8 @@ struct OprProxy<ElemwiseForward> { | |||||
opr->deduce_layout(inp, layouts.back()); | opr->deduce_layout(inp, layouts.back()); | ||||
} | } | ||||
//! Intentionally empty: only provides the init() entry point that
//! BenchmarkerBase calls on its proxy before the warm-up exec().
static void init(ElemwiseForward*, const TensorNDArray&) {} | |||||
static void exec(ElemwiseForward* opr, const TensorNDArray& tensors) { | static void exec(ElemwiseForward* opr, const TensorNDArray& tensors) { | ||||
megdnn_assert(tensors.size() >= 2); | megdnn_assert(tensors.size() >= 2); | ||||
auto inp = tensors; | auto inp = tensors; | ||||
@@ -151,6 +159,8 @@ struct OprProxy<ElemwiseMultiType> { | |||||
opr->deduce_layout(inp, layouts.back()); | opr->deduce_layout(inp, layouts.back()); | ||||
} | } | ||||
//! Intentionally empty: only provides the init() entry point that
//! BenchmarkerBase calls on its proxy before the warm-up exec().
static void init(ElemwiseMultiType*, const TensorNDArray&) {} | |||||
static void exec(ElemwiseMultiType* opr, const TensorNDArray& tensors) { | static void exec(ElemwiseMultiType* opr, const TensorNDArray& tensors) { | ||||
megdnn_assert(tensors.size() >= 2); | megdnn_assert(tensors.size() >= 2); | ||||
auto inp = tensors; | auto inp = tensors; | ||||
@@ -169,6 +179,8 @@ struct OprProxy<ConcatForward> { | |||||
opr->deduce_layout(inp, layouts.back()); | opr->deduce_layout(inp, layouts.back()); | ||||
} | } | ||||
//! Intentionally empty: only provides the init() entry point that
//! BenchmarkerBase calls on its proxy before the warm-up exec().
static void init(ConcatForward*, const TensorNDArray&) {} | |||||
void exec(ConcatForward* opr, const TensorNDArray& tensors) { | void exec(ConcatForward* opr, const TensorNDArray& tensors) { | ||||
if (!W.valid()) { | if (!W.valid()) { | ||||
W = WorkspaceWrapper(opr->handle(), 0); | W = WorkspaceWrapper(opr->handle(), 0); | ||||
@@ -200,6 +212,8 @@ struct OprProxy<CheckNonFinite> { | |||||
opr->deduce_layout(inp, layouts.back()); | opr->deduce_layout(inp, layouts.back()); | ||||
} | } | ||||
//! Intentionally empty: only provides the init() entry point that
//! BenchmarkerBase calls on its proxy before the warm-up exec().
static void init(CheckNonFinite*, const TensorNDArray&) {} | |||||
static void exec(CheckNonFinite* opr, const TensorNDArray& tensors) { | static void exec(CheckNonFinite* opr, const TensorNDArray& tensors) { | ||||
megdnn_assert(tensors.size() >= 2); | megdnn_assert(tensors.size() >= 2); | ||||
auto inps = tensors; | auto inps = tensors; | ||||
@@ -220,6 +234,9 @@ struct OprProxy<CheckNonFinite> { | |||||
template <> | template <> | ||||
struct OprProxy<SplitForward> : DeduceLayoutProxy<SplitForward, 0, false> { | struct OprProxy<SplitForward> : DeduceLayoutProxy<SplitForward, 0, false> { | ||||
WorkspaceWrapper W; | WorkspaceWrapper W; | ||||
//! Intentionally empty: only provides the init() entry point that
//! BenchmarkerBase calls on its proxy before the warm-up exec().
void init(SplitForward*, const TensorNDArray&) {} | |||||
void exec(SplitForward* opr, const TensorNDArray& tensors) { | void exec(SplitForward* opr, const TensorNDArray& tensors) { | ||||
megdnn_assert(tensors.size() >= 2); | megdnn_assert(tensors.size() >= 2); | ||||
if (!W.valid()) { | if (!W.valid()) { | ||||
@@ -428,7 +445,9 @@ struct OprProxyProfilingBase | |||||
best_algo); | best_algo); | ||||
} | } | ||||
void exec(Opr* opr, const TensorNDArray& tensors) { | |||||
//! Empty by default; virtual so that derived proxies (for example
//! OprWeightPreprocessProxyBenchmarkImpl) can do one-off preparation here.
//! BenchmarkerBase calls init() on its proxy before the warm-up exec().
virtual void init(Opr*, const TensorNDArray&) {} | |||||
virtual void exec(Opr* opr, const TensorNDArray& tensors) { | |||||
megdnn_assert(tensors.size() == arity); | megdnn_assert(tensors.size() == arity); | ||||
if (!W.valid()) { | if (!W.valid()) { | ||||
W = WorkspaceWrapper(opr->handle(), 0); | W = WorkspaceWrapper(opr->handle(), 0); | ||||
@@ -463,6 +482,8 @@ struct OprProxyProfilingBase | |||||
} | } | ||||
AlgoProxy<Opr, arity>::exec(opr, tensors, W.workspace()); | AlgoProxy<Opr, arity>::exec(opr, tensors, W.workspace()); | ||||
} | } | ||||
//! Virtual destructor: this class has virtual init()/exec(), so derived
//! proxies may be destroyed through a pointer to this base.
virtual ~OprProxyProfilingBase() {} | |||||
}; | }; | ||||
#define DEF_PROF(c) \ | #define DEF_PROF(c) \ | ||||
@@ -491,7 +512,7 @@ template <class Opr> | |||||
struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr> { | struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr> { | ||||
using Base = OprProxyProfilingBase<Opr>; | using Base = OprProxyProfilingBase<Opr>; | ||||
static constexpr int arity = OprTrait<Opr>::arity; | static constexpr int arity = OprTrait<Opr>::arity; | ||||
void exec(Opr* opr, const TensorNDArray& tensors) { | |||||
void exec(Opr* opr, const TensorNDArray& tensors) override { | |||||
megdnn_assert(tensors.size() == arity); | megdnn_assert(tensors.size() == arity); | ||||
if (!Base::W.valid()) { | if (!Base::W.valid()) { | ||||
Base::W = WorkspaceWrapper(opr->handle(), 0); | Base::W = WorkspaceWrapper(opr->handle(), 0); | ||||
@@ -584,11 +605,55 @@ struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr> { | |||||
} | } | ||||
}; | }; | ||||
template <class Opr> | |||||
struct OprWeightPreprocessProxyBenchmarkImpl | |||||
: public OprWeightPreprocessProxyImpl<Opr> { | |||||
using Base = OprProxyProfilingBase<Opr>; | |||||
static constexpr int arity = OprTrait<Opr>::arity; | |||||
void init(Opr* opr, const TensorNDArray& tensors) override { | |||||
megdnn_assert(tensors.size() == arity); | |||||
if (!Base::W.valid()) { | |||||
Base::W = WorkspaceWrapper(opr->handle(), 0); | |||||
} | |||||
TensorLayoutArray layouts; | |||||
for (auto&& tensor : tensors) { | |||||
layouts.push_back(tensor.layout); | |||||
} | |||||
m_preprocessed_tensors = this->weight_prerocess( | |||||
opr, tensors, Base::target_execution_policy.algo); | |||||
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||||
typename Opr::PreprocessedFilter preprocessed_filter{ | |||||
nullptr, *m_preprocessed_tensors}; | |||||
if (!Base::target_execution_policy.algo.valid()) { | |||||
auto workspace_size = AlgoProxy<Opr, arity>::get_workspace_in_bytes( | |||||
opr, layouts, &preprocessed_filter); | |||||
Base::W.update(workspace_size); | |||||
} | |||||
} | |||||
void exec(Opr* opr, const TensorNDArray& tensors) override { | |||||
megdnn_assert(tensors.size() == arity); | |||||
typename Opr::PreprocessedFilter preprocessed_filter{ | |||||
nullptr, *m_preprocessed_tensors}; | |||||
AlgoProxy<Opr, arity>::exec( | |||||
opr, tensors, &preprocessed_filter, Base::W.workspace()); | |||||
} | |||||
public: | |||||
std::shared_ptr<TensorNDArray> m_preprocessed_tensors; | |||||
}; | |||||
#define DEF_PROF(c) \ | #define DEF_PROF(c) \ | ||||
template <> \ | template <> \ | ||||
struct OprWeightPreprocessProxy<c> : public OprWeightPreprocessProxyImpl<c> { \ | struct OprWeightPreprocessProxy<c> : public OprWeightPreprocessProxyImpl<c> { \ | ||||
using OprWeightPreprocessProxyImpl<c>::OprWeightPreprocessProxyImpl; \ | using OprWeightPreprocessProxyImpl<c>::OprWeightPreprocessProxyImpl; \ | ||||
} | |||||
}; \ | |||||
template <> \ | |||||
struct OprWeightPreprocessBenchmarkProxy<c> \ | |||||
: public OprWeightPreprocessProxyBenchmarkImpl<c> { \ | |||||
using OprWeightPreprocessProxyBenchmarkImpl< \ | |||||
c>::OprWeightPreprocessProxyBenchmarkImpl; \ | |||||
}; | |||||
DEF_PROF(ConvolutionForward); | DEF_PROF(ConvolutionForward); | ||||
DEF_PROF(ConvBias); | DEF_PROF(ConvBias); | ||||
@@ -16,6 +16,7 @@ private: | |||||
public: | public: | ||||
OprProxy() = default; | OprProxy() = default; | ||||
OprProxy(int k) : m_k{k} {} | OprProxy(int k) : m_k{k} {} | ||||
//! Empty init() hooks. The TensorNDArray overload matches the init()
//! signature of the other operator proxies, so the benchmarker can hand this
//! proxy the same tensor array it passes to exec(). The TensorLayoutArray
//! overload is kept so that existing calls keep compiling.
void init(TopK*, const TensorLayoutArray&) {}
void init(TopK*, const TensorNDArray&) {}
void deduce_layout(TopK* opr, TensorLayoutArray& layouts) { | void deduce_layout(TopK* opr, TensorLayoutArray& layouts) { | ||||
if (layouts.size() == 3) { | if (layouts.size() == 3) { | ||||