GitOrigin-RevId: 1797f3b91c
dev-support-lite-fork-debug-mode
@@ -14,7 +14,7 @@ | |||
namespace megdnn { | |||
namespace test { | |||
template <typename Opr, typename T> | |||
template <typename Opr, typename T, typename Proxy = OprProxy<Opr>> | |||
class BenchmarkerBase { | |||
public: | |||
using Param = typename Opr::Param; | |||
@@ -28,7 +28,7 @@ public: | |||
m_handle(handle), | |||
m_default_rng(new NormalRNG()), | |||
m_param(Param()), | |||
m_proxy{new OprProxy<Opr>()} {} | |||
m_proxy{new Proxy()} {} | |||
const Handle* handle() const { return m_handle; } | |||
@@ -81,12 +81,12 @@ public: | |||
} | |||
return layouts; | |||
} | |||
BenchmarkerBase& set_proxy(std::unique_ptr<OprProxy<Opr>>& proxy) { | |||
BenchmarkerBase& set_proxy(std::unique_ptr<Proxy>& proxy) { | |||
m_proxy.reset(nullptr); | |||
m_proxy = std::move(proxy); | |||
return *this; | |||
} | |||
std::unique_ptr<OprProxy<Opr>>& proxy() { return m_proxy; } | |||
std::unique_ptr<Proxy>& proxy() { return m_proxy; } | |||
BenchmarkerBase& set_times(size_t times) { | |||
m_times = times; | |||
return *this; | |||
@@ -135,14 +135,14 @@ private: | |||
std::map<size_t, DType> m_dtype; | |||
std::map<size_t, TensorFormat> m_fmt; | |||
Param m_param; | |||
std::unique_ptr<OprProxy<Opr>> m_proxy; | |||
std::unique_ptr<Proxy> m_proxy; | |||
BeforeExecCallback m_before_exec_callback; | |||
std::unique_ptr<Opr> m_opr; | |||
TensorsConstriant m_tensor_constraint; | |||
}; | |||
template <typename Opr, typename T> | |||
float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) { | |||
template <typename Opr, typename T, typename OprProxy> | |||
float BenchmarkerBase<Opr, T, OprProxy>::exec(TensorLayoutArray layouts) { | |||
auto opr = this->opr(); | |||
opr->param() = m_param; | |||
auto user_layouts = layouts; | |||
@@ -196,6 +196,8 @@ float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) { | |||
if (m_before_exec_callback) { | |||
m_before_exec_callback(opr, tensors_cur); | |||
} | |||
//! init weights | |||
m_proxy->init(opr, tensors_cur); | |||
// run | |||
// warm up | |||
m_proxy->exec(opr, tensors_cur); | |||
@@ -246,8 +248,8 @@ float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) { | |||
return time_in_ms; | |||
} | |||
template <typename Opr, typename T> | |||
float BenchmarkerBase<Opr, T>::exect(const TensorValueArray& testcase_in) { | |||
template <typename Opr, typename T, typename Proxy> | |||
float BenchmarkerBase<Opr, T, Proxy>::exect(const TensorValueArray& testcase_in) { | |||
auto opr = this->opr(); | |||
opr->param() = m_param; | |||
TensorLayoutArray layouts; | |||
@@ -295,6 +297,8 @@ float BenchmarkerBase<Opr, T>::exect(const TensorValueArray& testcase_in) { | |||
if (m_before_exec_callback) { | |||
m_before_exec_callback(opr, tensors_cur); | |||
} | |||
//! init weights | |||
m_proxy->init(opr, tensors_cur); | |||
//! run | |||
//! warm up | |||
m_proxy->exec(opr, tensors_cur); | |||
@@ -344,19 +348,16 @@ float BenchmarkerBase<Opr, T>::exect(const TensorValueArray& testcase_in) { | |||
return time_in_ms; | |||
} | |||
template <typename Opr, typename T = Timer> | |||
class Benchmarker; | |||
template <typename Opr> | |||
class Benchmarker<Opr, Timer> : public BenchmarkerBase<Opr, Timer> { | |||
template <typename Opr, typename T = Timer, typename Proxy = OprProxy<Opr>> | |||
class Benchmarker : public BenchmarkerBase<Opr, T, Proxy> { | |||
public: | |||
Benchmarker(Handle* handle) : BenchmarkerBase<Opr, Timer>{handle, Timer{}} {} | |||
//! Build a benchmarker on \p handle. The timer handed to the base class is a
//! default-constructed instance of the configured timer type \p T, so the
//! argument always matches BenchmarkerBase<Opr, T, Proxy> (identical to the
//! previous Timer{} when T is the default Timer).
Benchmarker(Handle* handle) : BenchmarkerBase<Opr, T, Proxy>{handle, T{}} {}
}; | |||
////////////////// Algo Benchmark //////////////////////// | |||
template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer> | |||
float algo_benchmark( | |||
Benchmarker<Opr, T>& benchmark, TensorLayoutArray layouts, | |||
Benchmarker<Opr, T, Proxy>& benchmark, TensorLayoutArray layouts, | |||
const std::string& algo_base) { | |||
Proxy proxy; | |||
auto opr = benchmark.opr(); | |||
@@ -381,7 +382,7 @@ float algo_benchmark( | |||
template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer> | |||
float algo_benchmark( | |||
Benchmarker<Opr, T>& benchmark, TensorShapeArray shapes, | |||
Benchmarker<Opr, T, Proxy>& benchmark, TensorShapeArray shapes, | |||
const std::string& algo_base) { | |||
return algo_benchmark(benchmark, benchmark.make_layouts(shapes), algo_base); | |||
} | |||
@@ -995,6 +995,52 @@ void benchmark_winograd( | |||
used / used_winograd); | |||
} | |||
} | |||
// usage of weight pre-processing for winograd benchmark | |||
void benchmark_winograd_weight_preprocess( | |||
const char* algo_name, megdnn::Handle* handle, size_t kernel, | |||
size_t pack_size) { | |||
auto&& args = get_winograd_benchmark_args(kernel, pack_size); | |||
using namespace conv_bias; | |||
constexpr size_t RUN = 10; | |||
//! here!!! | |||
Benchmarker<ConvBias, Timer, OprWeightPreprocessBenchmarkProxy<ConvBias>> | |||
benchmark_winograd(handle); | |||
benchmark_winograd.set_display(false); | |||
benchmark_winograd.set_times(RUN); | |||
for (auto&& arg : args) { | |||
TensorLayout dst_layout; | |||
auto opr = handle->create_operator<ConvBias>(); | |||
opr->param() = arg.param; | |||
opr->deduce_layout( | |||
{arg.src, dtype::Float32()}, {arg.filter, dtype::Float32()}, | |||
{arg.bias, dtype::Float32()}, {}, dst_layout); | |||
//! dst.nr_elems * IC * FH * FW * 2 | |||
float computations = dst_layout.total_nr_elems() * arg.filter[1] * | |||
arg.filter[2] * arg.filter[3] * 2.0 / | |||
(1024 * 1024 * 1024) * 1e3; | |||
param::Convolution conv_param; | |||
conv_param.pad_h = arg.param.pad_h; | |||
conv_param.pad_w = arg.param.pad_w; | |||
conv_param.stride_h = arg.param.stride_h; | |||
conv_param.stride_w = arg.param.stride_w; | |||
benchmark_winograd.set_param(arg.param); | |||
auto used_winograd = | |||
algo_benchmark< | |||
ConvBias, OprWeightPreprocessBenchmarkProxy<ConvBias>, Timer>( | |||
benchmark_winograd, {arg.src, arg.filter, {}, {}, {}}, | |||
algo_name) / | |||
RUN; | |||
printf("%s %s: %s: %f ms %f Gflops\n", arg.src.to_string().c_str(), | |||
arg.filter.to_string().c_str(), algo_name, used_winograd, | |||
computations / used_winograd); | |||
} | |||
} | |||
#endif // MEGDNN_WITH_BENCHMARK | |||
template <class Checker> | |||
@@ -66,6 +66,9 @@ std::vector<conv_bias::TestArg> get_winograd_benchmark_args( | |||
void benchmark_winograd( | |||
const char* algo_name, megdnn::Handle* handle, size_t kernel, | |||
size_t pack_size = 1); | |||
void benchmark_winograd_weight_preprocess( | |||
const char* algo_name, megdnn::Handle* handle, size_t kernel, | |||
size_t pack_size = 1); | |||
#endif // MEGDNN_WITH_BENCHMARK | |||
template <class Checker> | |||
void check_winograd( | |||
@@ -114,7 +114,10 @@ template < | |||
bool has_workspace = OprTrait<Opr>::has_workspace, | |||
bool can_deduce_layout = OprTrait<Opr>::can_deduce_layout> | |||
struct OprProxyDefaultImpl : public DeduceLayoutProxy<Opr, arity, can_deduce_layout>, | |||
public ExecProxy<Opr, arity, has_workspace> {}; | |||
public ExecProxy<Opr, arity, has_workspace> { | |||
virtual void init(Opr*, const TensorNDArray&) {} | |||
virtual ~OprProxyDefaultImpl() {} | |||
}; | |||
template <typename Opr> | |||
struct OprProxy : public OprProxyDefaultImpl<Opr> {}; | |||
@@ -123,6 +126,9 @@ template <typename Opr> | |||
struct OprWeightPreprocessProxy : public OprProxyDefaultImpl<Opr> {}; | |||
template <typename Opr> | |||
struct OprWeightPreprocessBenchmarkProxy : OprProxyDefaultImpl<Opr> {}; | |||
template <typename Opr> | |||
struct OprProxyVectorToSingle {}; | |||
template <> | |||
@@ -134,6 +140,8 @@ struct OprProxy<ElemwiseForward> { | |||
opr->deduce_layout(inp, layouts.back()); | |||
} | |||
static void init(ElemwiseForward*, const TensorNDArray&) {} | |||
static void exec(ElemwiseForward* opr, const TensorNDArray& tensors) { | |||
megdnn_assert(tensors.size() >= 2); | |||
auto inp = tensors; | |||
@@ -151,6 +159,8 @@ struct OprProxy<ElemwiseMultiType> { | |||
opr->deduce_layout(inp, layouts.back()); | |||
} | |||
static void init(ElemwiseMultiType*, const TensorNDArray&) {} | |||
static void exec(ElemwiseMultiType* opr, const TensorNDArray& tensors) { | |||
megdnn_assert(tensors.size() >= 2); | |||
auto inp = tensors; | |||
@@ -169,6 +179,8 @@ struct OprProxy<ConcatForward> { | |||
opr->deduce_layout(inp, layouts.back()); | |||
} | |||
static void init(ConcatForward*, const TensorNDArray&) {} | |||
void exec(ConcatForward* opr, const TensorNDArray& tensors) { | |||
if (!W.valid()) { | |||
W = WorkspaceWrapper(opr->handle(), 0); | |||
@@ -200,6 +212,8 @@ struct OprProxy<CheckNonFinite> { | |||
opr->deduce_layout(inp, layouts.back()); | |||
} | |||
static void init(CheckNonFinite*, const TensorNDArray&) {} | |||
static void exec(CheckNonFinite* opr, const TensorNDArray& tensors) { | |||
megdnn_assert(tensors.size() >= 2); | |||
auto inps = tensors; | |||
@@ -220,6 +234,9 @@ struct OprProxy<CheckNonFinite> { | |||
template <> | |||
struct OprProxy<SplitForward> : DeduceLayoutProxy<SplitForward, 0, false> { | |||
WorkspaceWrapper W; | |||
void init(SplitForward*, const TensorNDArray&) {} | |||
void exec(SplitForward* opr, const TensorNDArray& tensors) { | |||
megdnn_assert(tensors.size() >= 2); | |||
if (!W.valid()) { | |||
@@ -428,7 +445,9 @@ struct OprProxyProfilingBase | |||
best_algo); | |||
} | |||
void exec(Opr* opr, const TensorNDArray& tensors) { | |||
virtual void init(Opr*, const TensorNDArray&) {} | |||
virtual void exec(Opr* opr, const TensorNDArray& tensors) { | |||
megdnn_assert(tensors.size() == arity); | |||
if (!W.valid()) { | |||
W = WorkspaceWrapper(opr->handle(), 0); | |||
@@ -463,6 +482,8 @@ struct OprProxyProfilingBase | |||
} | |||
AlgoProxy<Opr, arity>::exec(opr, tensors, W.workspace()); | |||
} | |||
virtual ~OprProxyProfilingBase() {} | |||
}; | |||
#define DEF_PROF(c) \ | |||
@@ -491,7 +512,7 @@ template <class Opr> | |||
struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr> { | |||
using Base = OprProxyProfilingBase<Opr>; | |||
static constexpr int arity = OprTrait<Opr>::arity; | |||
void exec(Opr* opr, const TensorNDArray& tensors) { | |||
void exec(Opr* opr, const TensorNDArray& tensors) override { | |||
megdnn_assert(tensors.size() == arity); | |||
if (!Base::W.valid()) { | |||
Base::W = WorkspaceWrapper(opr->handle(), 0); | |||
@@ -584,11 +605,55 @@ struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr> { | |||
} | |||
}; | |||
template <class Opr> | |||
struct OprWeightPreprocessProxyBenchmarkImpl | |||
: public OprWeightPreprocessProxyImpl<Opr> { | |||
using Base = OprProxyProfilingBase<Opr>; | |||
static constexpr int arity = OprTrait<Opr>::arity; | |||
void init(Opr* opr, const TensorNDArray& tensors) override { | |||
megdnn_assert(tensors.size() == arity); | |||
if (!Base::W.valid()) { | |||
Base::W = WorkspaceWrapper(opr->handle(), 0); | |||
} | |||
TensorLayoutArray layouts; | |||
for (auto&& tensor : tensors) { | |||
layouts.push_back(tensor.layout); | |||
} | |||
m_preprocessed_tensors = this->weight_prerocess( | |||
opr, tensors, Base::target_execution_policy.algo); | |||
megcoreSynchronize(opr->handle()->megcore_computing_handle()); | |||
typename Opr::PreprocessedFilter preprocessed_filter{ | |||
nullptr, *m_preprocessed_tensors}; | |||
if (!Base::target_execution_policy.algo.valid()) { | |||
auto workspace_size = AlgoProxy<Opr, arity>::get_workspace_in_bytes( | |||
opr, layouts, &preprocessed_filter); | |||
Base::W.update(workspace_size); | |||
} | |||
} | |||
void exec(Opr* opr, const TensorNDArray& tensors) override { | |||
megdnn_assert(tensors.size() == arity); | |||
typename Opr::PreprocessedFilter preprocessed_filter{ | |||
nullptr, *m_preprocessed_tensors}; | |||
AlgoProxy<Opr, arity>::exec( | |||
opr, tensors, &preprocessed_filter, Base::W.workspace()); | |||
} | |||
public: | |||
std::shared_ptr<TensorNDArray> m_preprocessed_tensors; | |||
}; | |||
#define DEF_PROF(c) \ | |||
template <> \ | |||
struct OprWeightPreprocessProxy<c> : public OprWeightPreprocessProxyImpl<c> { \ | |||
using OprWeightPreprocessProxyImpl<c>::OprWeightPreprocessProxyImpl; \ | |||
} | |||
}; \ | |||
template <> \ | |||
struct OprWeightPreprocessBenchmarkProxy<c> \ | |||
: public OprWeightPreprocessProxyBenchmarkImpl<c> { \ | |||
using OprWeightPreprocessProxyBenchmarkImpl< \ | |||
c>::OprWeightPreprocessProxyBenchmarkImpl; \ | |||
}; | |||
DEF_PROF(ConvolutionForward); | |||
DEF_PROF(ConvBias); | |||
@@ -16,6 +16,7 @@ private: | |||
public: | |||
OprProxy() = default; | |||
OprProxy(int k) : m_k{k} {} | |||
//! No-op initialization hooks: nothing is prepared for TopK before execution.
//! The TensorNDArray overload accepts the same tensor-array argument as the
//! init() hooks of the other OprProxy specializations, so a benchmarker can
//! call init(opr, tensors) on this proxy as well.
void init(TopK*, const TensorLayoutArray&) {}
void init(TopK*, const TensorNDArray&) {}
void deduce_layout(TopK* opr, TensorLayoutArray& layouts) { | |||
if (layouts.size() == 3) { | |||