
benchmarker.h 10 kB

/**
 * \file dnn/test/common/benchmarker.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 */
#pragma once

#include <map>
#include <memory>
#include <vector>
#include <regex>

#include "megdnn/basic_types.h"
#include "megdnn/tensor_format.h"
#include "test/common/opr_proxy.h"
#include "test/common/rng.h"
#include "test/common/timer.h"
#include "test/common/opr_algo_proxy.h"

namespace megdnn {
namespace test {

template <typename Opr, typename T>
class BenchmarkerBase {
public:
    using Param = typename Opr::Param;
    using TensorValueArray = TensorNDArray;
    using BeforeExecCallback =
            std::function<void(Opr*, const TensorValueArray&)>;

    BenchmarkerBase(Handle* handle, T timer)
            : m_timer(timer),
              m_handle_naive(create_cpu_handle(2, false)),
              m_handle(handle),
              m_default_rng(new NormalRNG()),
              m_param(Param()),
              m_proxy{new OprProxy<Opr>()} {}

    const Handle* handle() const { return m_handle; }
    /*!
     * \brief benchmark opr on current param/dtype/rng config
     * \returns elapsed time in ms
     *
     * The benchmarker constructs TensorLayout vectors from the shapes
     * and dtypes, then calls exec(TensorLayoutArray&).
     */
    float exec(const TensorShapeArray& shapes) {
        return exec(make_layouts(shapes));
    }
    float exec(TensorLayoutArray layouts);

    //! disambiguate overloaded exec
    float execs(const TensorShapeArray& shapes) { return exec(shapes); }
    float execl(const TensorLayoutArray& layouts) { return exec(layouts); }
    BenchmarkerBase& set_param(Param param) {
        m_param = param;
        return *this;
    }
    BenchmarkerBase& set_dtype(size_t idx, DType dtype) {
        m_dtype[idx] = dtype;
        return *this;
    }
    BenchmarkerBase& set_rng(size_t idx, RNG* rng) {
        m_rng[idx] = rng;
        return *this;
    }
    BenchmarkerBase& set_fmt(size_t idx, TensorFormat fmt) {
        m_fmt[idx] = fmt;
        return *this;
    }
    TensorLayoutArray make_layouts(const TensorShapeArray& shapes) {
        TensorLayoutArray layouts(shapes.size());
        for (size_t i = 0; i < shapes.size(); ++i) {
            DType dt = (m_dtype.find(i) != m_dtype.end() ? m_dtype[i]
                                                         : dtype::Float32());
            TensorFormat fmt = (m_fmt.find(i) != m_fmt.end()
                                        ? m_fmt[i]
                                        : DefaultTensorFormat::make());
            layouts[i] = TensorLayout(shapes[i], dt, fmt);
        }
        return layouts;
    }
    BenchmarkerBase& set_proxy(std::unique_ptr<OprProxy<Opr>>& proxy) {
        m_proxy.reset(nullptr);
        m_proxy = std::move(proxy);
        return *this;
    }
    std::unique_ptr<OprProxy<Opr>>& proxy() { return m_proxy; }
    BenchmarkerBase& set_times(size_t times) {
        m_times = times;
        return *this;
    }
    BenchmarkerBase& set_display(bool display) {
        m_display = display;
        return *this;
    }

    //! set a callback to be invoked before executing the operator
    BenchmarkerBase& set_before_exec_callback(const BeforeExecCallback& cb) {
        m_before_exec_callback = cb;
        return *this;
    }
    /*!
     * \brief enable adaptive benchmarking: ignore set_times() and find a
     *      suitable number of runs for the given total duration
     *
     * Note: if this is enabled, the value returned by exec() is the
     * average time per run rather than the total elapsed time.
     */
    BenchmarkerBase& set_adaptive_benchmark(float tot_time_in_secs) {
        m_adaptive_secs = tot_time_in_secs;
        return *this;
    }
    //! get the opr impl so that settings other than param() can be modified
    Opr* opr() {
        if (!m_opr) {
            m_opr = m_handle->create_operator<Opr>();
        }
        return m_opr.get();
    }
    const Param& param() const { return m_param; }
private:
    T m_timer;
    bool m_display = true;
    size_t m_times = 1;
    float m_adaptive_secs = 0;
    std::unique_ptr<Handle> m_handle_naive;
    Handle* m_handle;
    std::unique_ptr<RNG> m_default_rng;
    std::map<size_t, RNG*> m_rng;
    std::map<size_t, DType> m_dtype;
    std::map<size_t, TensorFormat> m_fmt;
    Param m_param;
    std::unique_ptr<OprProxy<Opr>> m_proxy;
    BeforeExecCallback m_before_exec_callback;
    std::unique_ptr<Opr> m_opr;
};
template <typename Opr, typename T>
float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) {
    auto opr = this->opr();
    opr->param() = m_param;
    auto user_layouts = layouts;
    m_proxy->deduce_layout(opr, layouts);
    for (size_t i = 0; i < layouts.size(); ++i)
        if (user_layouts[i].ndim > 0) {
            auto run = [&]() {
                ASSERT_TRUE(layouts[i].eq_shape(user_layouts[i]))
                        << "User provided shape is "
                        << user_layouts[i].TensorShape::to_string()
                        << "\nExpected shape is "
                        << layouts[i].TensorShape::to_string();
            };
            run();
        }
    auto allocate = [&layouts](Handle* handle) {
        TensorNDArray tensors(layouts.size());
        auto trans_func = [handle](const TensorLayout& layout) {
            auto span = layout.span();
            TensorND res;
            res.raw_ptr = static_cast<uint8_t*>(
                                  megdnn_malloc(handle, span.dist_byte())) +
                          span.low_byte;
            res.layout = layout;
            return res;
        };
        std::transform(layouts.begin(), layouts.end(), tensors.begin(),
                       trans_func);
        return tensors;
    };
    auto tensors_cur = allocate(m_handle);
    auto tensors_cur_host = allocate(m_handle_naive.get());
    // init
    for (size_t i = 0; i < tensors_cur_host.size(); ++i) {
        TensorND& tensor = tensors_cur_host[i];
        auto rng = m_rng[i];
        if (!rng)
            rng = m_default_rng.get();
        auto size = tensor.layout.span().high_byte;
        rng->gen(tensor);
        if (tensor.layout.ndim == 0)
            continue;
        megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr, tensor.raw_ptr,
                          size);
    }
    if (m_before_exec_callback) {
        m_before_exec_callback(opr, tensors_cur);
    }
    // run
    // warm up
    m_proxy->exec(opr, tensors_cur);
    megcoreSynchronize(m_handle->megcore_computing_handle());
    if (m_adaptive_secs) {
        // find m_times for adaptive benchmarking
        m_times = 0;
        int cur_times = 1;
        auto remain_time = m_adaptive_secs * 1e6;
        while (remain_time > 0) {
            m_timer.reset();
            m_timer.start();
            for (int i = 0; i < cur_times; ++i)
                m_proxy->exec(opr, tensors_cur);
            megcoreSynchronize(m_handle->megcore_computing_handle());
            m_timer.stop();
            m_times += cur_times;
            auto this_run_time = m_timer.get_time_in_us();
            remain_time -= this_run_time;
            cur_times = std::min(
                    cur_times * 2,
                    std::max<int>(1, remain_time / this_run_time * cur_times));
        }
    }
    m_timer.reset();
    m_timer.start();
    for (size_t t = 0; t < m_times; ++t)
        m_proxy->exec(opr, tensors_cur);
    megcoreSynchronize(m_handle->megcore_computing_handle());
    m_timer.stop();
    auto time_in_ms = m_timer.get_time_in_us() / 1e3;
    if (m_display) {
        std::cout << "Total time is " << time_in_ms << "ms "
                  << "for " << m_times << " run(s)." << std::endl;
    }
    auto free = [](Handle* handle, TensorNDArray& tensors) {
        std::for_each(tensors.begin(), tensors.end(),
                      [handle](const TensorND& tensor) {
                          megdnn_free(handle, tensor.raw_ptr);
                      });
    };
    free(m_handle, tensors_cur);
    free(m_handle_naive.get(), tensors_cur_host);
    if (m_adaptive_secs)
        time_in_ms /= m_times;
    return time_in_ms;
}
template <typename Opr, typename T = Timer>
class Benchmarker;

template <typename Opr>
class Benchmarker<Opr, Timer> : public BenchmarkerBase<Opr, Timer> {
public:
    Benchmarker(Handle* handle)
            : BenchmarkerBase<Opr, Timer>{handle, Timer{}} {}
};
////////////////// Algo Benchmark ////////////////////////
template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer>
float algo_benchmark(Benchmarker<Opr, T>& benchmark, TensorLayoutArray layouts,
                     const std::string& algo_base) {
    Proxy proxy;
    auto opr = benchmark.opr();
    opr->param() = benchmark.param();
    proxy.deduce_layout(opr, layouts);
    auto algos = OprAlgoProxy<Opr>::get_all_algorithms(opr, layouts);
    float min_used = std::numeric_limits<float>::max();
    bool execed = false;
    for (auto i : algos) {
        if (std::regex_match(i->name(),
                             std::regex("(" + algo_base + ")(.*)"))) {
            opr->execution_policy().algorithm = i;
            auto used = benchmark.exec(layouts);
            min_used = std::min(min_used, used);
            printf("run algo: %s used: %f ms min_used: %f ms\n", i->name(),
                   used, min_used);
            execed = true;
        }
    }
    megdnn_assert(execed, "no algo starting with %s\n", algo_base.c_str());
    return min_used;
}
template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer>
float algo_benchmark(Benchmarker<Opr, T>& benchmark, TensorShapeArray shapes,
                     const std::string& algo_base) {
    return algo_benchmark(benchmark, benchmark.make_layouts(shapes), algo_base);
}

}  // namespace test
}  // namespace megdnn

// vim: syntax=cpp.doxygen
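For reference, below is a minimal usage sketch of this header. It is illustrative only: the Convolution operator, the shapes, the "CUDNN" algorithm prefix, and the way the Handle is obtained are assumptions patterned on how megdnn tests typically drive a Benchmarker; they are not part of this file.

// Illustrative sketch, not part of the header; assumes a valid megdnn
// Handle (e.g. from a test fixture) and shapes meaningful for Convolution.
#include <cstdio>
#include "megdnn/oprs.h"
#include "test/common/benchmarker.h"

using namespace megdnn;
using namespace test;

void run_conv_benchmark(Handle* handle) {
    // Fixed-count mode: run 10 times and report total elapsed time.
    Benchmarker<Convolution> benchmarker(handle);
    benchmarker.set_param(Convolution::Param{})  // default dense convolution
            .set_dtype(0, dtype::Float32())      // src
            .set_dtype(1, dtype::Float32())      // filter
            .set_dtype(2, dtype::Float32())      // dst
            .set_times(10);
    // src NCHW (1, 64, 56, 56), filter OIHW (64, 64, 3, 3); the empty
    // shape {} lets the proxy deduce the output layout.
    TensorShapeArray shapes{{1, 64, 56, 56}, {64, 64, 3, 3}, {}};
    float total_ms = benchmarker.execs(shapes);
    printf("total: %.3f ms over 10 runs\n", total_ms);

    // Adaptive mode: run for roughly 1 s; exec() then returns the
    // average time per run instead of the total.
    benchmarker.set_adaptive_benchmark(1.0f);
    float avg_ms = benchmarker.execs(shapes);
    printf("avg per run: %.3f ms\n", avg_ms);

    // Benchmark only algorithms whose names start with a given prefix
    // ("CUDNN" is a hypothetical example) and return the fastest time.
    algo_benchmark<Convolution>(benchmarker, shapes, "CUDNN");
}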

The MegEngine installation package bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build. To run GPU programs, make sure the machine has GPU hardware with the driver installed. If you would like to try deep-learning development on a cloud GPU compute platform, you are welcome to visit the MegStudio platform.