You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

benchmarker.h 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406
  1. /**
  2. * \file dnn/test/common/benchmarker.h
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #pragma once
  13. #include <map>
  14. #include <memory>
  15. #include <regex>
  16. #include <vector>
  17. #include "megdnn/basic_types.h"
  18. #include "megdnn/tensor_format.h"
  19. #include "test/common/opr_algo_proxy.h"
  20. #include "test/common/opr_proxy.h"
  21. #include "test/common/rng.h"
  22. #include "test/common/timer.h"
  23. namespace megdnn {
  24. namespace test {
  25. template <typename Opr, typename T>
  26. class BenchmarkerBase {
  27. public:
  28. using Param = typename Opr::Param;
  29. using TensorValueArray = TensorNDArray;
  30. using BeforeExecCallback =
  31. std::function<void(Opr*, const TensorValueArray&)>;
  32. using TensorsConstriant = std::function<void(TensorValueArray& tensors)>;
  33. BenchmarkerBase(Handle* handle, T timer)
  34. : m_timer(timer),
  35. m_handle_naive(create_cpu_handle(2, false)),
  36. m_handle(handle),
  37. m_default_rng(new NormalRNG()),
  38. m_param(Param()),
  39. m_proxy{new OprProxy<Opr>()} {}
  40. const Handle* handle() const { return m_handle; }
  41. /*!
  42. * \brief benchmark opr on current param/dtype/rng config
  43. * \returns elapsed time in ms
  44. *
  45. * Benchmarker would construct TensorLayout vectors from shapes and
  46. * dtypes and call exec(TensorLayoutArray &).
  47. */
  48. float exec(const TensorShapeArray& shapes) {
  49. return exec(make_layouts(shapes));
  50. }
  51. float exec(TensorLayoutArray layouts);
  52. float exect(const TensorValueArray& testcase_in);
  53. //! disabiguate overloaded exec
  54. float execs(const TensorShapeArray& shapes) { return exec(shapes); }
  55. float execl(const TensorLayoutArray& layouts) { return exec(layouts); }
  56. BenchmarkerBase& set_param(Param param) {
  57. m_param = param;
  58. return *this;
  59. }
  60. BenchmarkerBase& set_dtype(size_t idx, DType dtype) {
  61. m_dtype[idx] = dtype;
  62. return *this;
  63. }
  64. BenchmarkerBase& set_rng(size_t idx, RNG* rng) {
  65. m_rng[idx] = rng;
  66. return *this;
  67. }
  68. BenchmarkerBase& set_fmt(size_t idx, TensorFormat fmt) {
  69. m_fmt[idx] = fmt;
  70. return *this;
  71. }
  72. BenchmarkerBase& set_tensors_constraint(
  73. const TensorsConstriant& tensor_constraint) {
  74. m_tensor_constraint = tensor_constraint;
  75. return *this;
  76. }
  77. TensorLayoutArray make_layouts(const TensorShapeArray& shapes) {
  78. TensorLayoutArray layouts(shapes.size());
  79. for (size_t i = 0; i < shapes.size(); ++i) {
  80. DType dt = (m_dtype.find(i) != m_dtype.end() ? m_dtype[i]
  81. : dtype::Float32());
  82. TensorFormat fmt = (m_fmt.find(i) != m_fmt.end()
  83. ? m_fmt[i]
  84. : DefaultTensorFormat::make());
  85. layouts[i] = TensorLayout(shapes[i], dt, fmt);
  86. }
  87. return layouts;
  88. }
  89. BenchmarkerBase& set_proxy(std::unique_ptr<OprProxy<Opr>>& proxy) {
  90. m_proxy.reset(nullptr);
  91. m_proxy = std::move(proxy);
  92. return *this;
  93. }
  94. std::unique_ptr<OprProxy<Opr>>& proxy() { return m_proxy; }
  95. BenchmarkerBase& set_times(size_t times) {
  96. m_times = times;
  97. return *this;
  98. }
  99. BenchmarkerBase& set_display(bool display) {
  100. m_display = display;
  101. return *this;
  102. }
  103. //! set a callback to be invoked before executing the operator
  104. BenchmarkerBase& set_before_exec_callback(const BeforeExecCallback& cb) {
  105. m_before_exec_callback = cb;
  106. return *this;
  107. }
  108. /*!
  109. * \brief set adaptive benchmarking: ignore set_times() and find
  110. * suitable times to run for given duration;
  111. *
  112. * Note: the value returned by exec() would be average time per run,
  113. * rather than total elapsed time, if this is enabled.
  114. */
  115. BenchmarkerBase& set_adaptive_benchmark(float tot_time_in_secs) {
  116. m_adaptive_secs = tot_time_in_secs;
  117. return *this;
  118. }
  119. //! get the opr impl so setting other than param() can be modified
  120. Opr* opr() {
  121. if (!m_opr) {
  122. m_opr = m_handle->create_operator<Opr>();
  123. }
  124. return m_opr.get();
  125. }
  126. const Param& param() const { return m_param; }
  127. private:
  128. T m_timer;
  129. bool m_display = true;
  130. size_t m_times = 1;
  131. float m_adaptive_secs = 0;
  132. std::unique_ptr<Handle> m_handle_naive;
  133. Handle* m_handle;
  134. std::unique_ptr<RNG> m_default_rng;
  135. std::map<size_t, RNG*> m_rng;
  136. std::map<size_t, DType> m_dtype;
  137. std::map<size_t, TensorFormat> m_fmt;
  138. Param m_param;
  139. std::unique_ptr<OprProxy<Opr>> m_proxy;
  140. BeforeExecCallback m_before_exec_callback;
  141. std::unique_ptr<Opr> m_opr;
  142. TensorsConstriant m_tensor_constraint;
  143. };
/*!
 * \brief run the operator on randomly initialized tensors and time it
 * \returns total elapsed time in ms over m_times runs, or average time per
 *      run when adaptive benchmarking is enabled
 */
template <typename Opr, typename T>
float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) {
    auto opr = this->opr();
    opr->param() = m_param;
    // keep the caller-provided layouts so they can be checked against the
    // layouts deduced by the operator
    auto user_layouts = layouts;
    m_proxy->deduce_layout(opr, layouts);
    for (size_t i = 0; i < layouts.size(); ++i)
        if (user_layouts[i].ndim > 0) {
            // gtest's ASSERT_TRUE must run in a void context, hence the
            // wrapping lambda
            auto run = [&]() {
                ASSERT_TRUE(layouts[i].eq_shape(user_layouts[i]))
                        << "User provided shape is "
                        << user_layouts[i].TensorShape::to_string()
                        << "\nExpected shape is "
                        << layouts[i].TensorShape::to_string();
            };
            run();
        }
    // allocate one buffer per layout on the given handle; raw_ptr is offset
    // by span.low_byte so it addresses the logical element at offset 0
    auto allocate = [&layouts](Handle* handle) {
        TensorNDArray tensors(layouts.size());
        auto trans_func = [handle](const TensorLayout& layout) {
            auto span = layout.span();
            TensorND res;
            res.raw_ptr = static_cast<uint8_t*>(
                                  megdnn_malloc(handle, span.dist_byte())) +
                          span.low_byte;
            res.layout = layout;
            return res;
        };
        std::transform(layouts.begin(), layouts.end(), tensors.begin(),
                       trans_func);
        return tensors;
    };
    auto tensors_cur = allocate(m_handle);
    auto tensors_cur_host = allocate(m_handle_naive.get());
    // init: fill host tensors with random values (per-index RNG when set,
    // otherwise the default normal RNG)
    for (size_t i = 0; i < tensors_cur_host.size(); ++i) {
        TensorND& tensor = tensors_cur_host[i];
        // NOTE(review): operator[] default-inserts a nullptr entry into
        // m_rng for every index not explicitly set — harmless here since
        // nullptr falls back to the default RNG, but it does mutate the map
        auto rng = m_rng[i];
        if (!rng)
            rng = m_default_rng.get();
        rng->gen(tensor);
    }
    if (m_tensor_constraint) {
        m_tensor_constraint(tensors_cur_host);
    }
    // copy the initialized host data to the device under benchmark
    for (size_t i = 0; i < tensors_cur_host.size(); ++i) {
        TensorND& tensor = tensors_cur_host[i];
        if (tensor.layout.ndim == 0)
            continue;
        auto size = tensor.layout.span().high_byte;
        megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr, tensor.raw_ptr,
                          size);
    }
    if (m_before_exec_callback) {
        m_before_exec_callback(opr, tensors_cur);
    }
    // run
    // warm up
    m_proxy->exec(opr, tensors_cur);
    megcoreSynchronize(m_handle->megcore_computing_handle());
    if (m_adaptive_secs) {
        // find m_times for adaptive benchmarking: grow the batch size
        // (doubling, capped by an extrapolation from the last batch) until
        // the requested total duration is consumed
        m_times = 0;
        int cur_times = 1;
        auto remain_time = m_adaptive_secs * 1e6;
        while (remain_time > 0) {
            m_timer.reset();
            m_timer.start();
            for (int i = 0; i < cur_times; ++i)
                m_proxy->exec(opr, tensors_cur);
            megcoreSynchronize(m_handle->megcore_computing_handle());
            m_timer.stop();
            m_times += cur_times;
            auto this_run_time = m_timer.get_time_in_us();
            remain_time -= this_run_time;
            cur_times = std::min(
                    cur_times * 2,
                    std::max<int>(1, remain_time / this_run_time * cur_times));
        }
    }
    // the timed runs
    m_timer.reset();
    m_timer.start();
    for (size_t t = 0; t < m_times; ++t)
        m_proxy->exec(opr, tensors_cur);
    megcoreSynchronize(m_handle->megcore_computing_handle());
    m_timer.stop();
    auto time_in_ms = m_timer.get_time_in_us() / 1e3;
    if (m_display) {
        std::cout << "Total time is " << time_in_ms << "ms "
                  << "for " << m_times << " run(s)." << std::endl;
    }
    // NOTE(review): raw_ptr was offset by span.low_byte at allocation but is
    // freed un-adjusted here; this assumes low_byte == 0 for all benchmarked
    // layouts — confirm for layouts with negative strides
    auto free = [](Handle* handle, TensorNDArray& tensors) {
        std::for_each(tensors.begin(), tensors.end(),
                      [handle](const TensorND& tensor) {
                          megdnn_free(handle, tensor.raw_ptr);
                      });
    };
    free(m_handle, tensors_cur);
    free(m_handle_naive.get(), tensors_cur_host);
    // in adaptive mode, report the average time per run instead of the total
    if (m_adaptive_secs)
        time_in_ms /= m_times;
    return time_in_ms;
}
/*!
 * \brief like exec(), but inputs come from caller-provided tensor values
 *      instead of being randomly generated
 * \param testcase_in host tensors supplying both the input layouts and the
 *      input values; ownership stays with the caller
 * \returns total elapsed time in ms over m_times runs, or average time per
 *      run when adaptive benchmarking is enabled
 */
template <typename Opr, typename T>
float BenchmarkerBase<Opr, T>::exect(const TensorValueArray& testcase_in) {
    auto opr = this->opr();
    opr->param() = m_param;
    TensorLayoutArray layouts;
    TensorNDArray tensors_cur_host;
    for (auto& inp : testcase_in) {
        layouts.push_back(inp.layout);
        tensors_cur_host.emplace_back(inp);
    }
    // keep the caller-provided layouts so they can be checked against the
    // layouts deduced by the operator
    auto user_layouts = layouts;
    m_proxy->deduce_layout(opr, layouts);
    for (size_t i = 0; i < layouts.size(); ++i)
        if (user_layouts[i].ndim > 0) {
            //! gtest's ASSERT_TRUE must run in a void context, hence the
            //! wrapping lambda
            auto run = [&]() {
                ASSERT_TRUE(layouts[i].eq_shape(user_layouts[i]))
                        << "User provided shape is "
                        << user_layouts[i].TensorShape::to_string()
                        << "\nExpected shape is "
                        << layouts[i].TensorShape::to_string();
            };
            run();
        }
    //! allocate one buffer per layout (inputs and deduced outputs alike) on
    //! the given handle; raw_ptr is offset by span.low_byte so it addresses
    //! the logical element at offset 0
    auto allocate = [&layouts](Handle* handle) {
        TensorNDArray tensors(layouts.size());
        auto trans_func = [handle](const TensorLayout& layout) {
            auto span = layout.span();
            TensorND res;
            res.raw_ptr = static_cast<uint8_t*>(
                                  megdnn_malloc(handle, span.dist_byte())) +
                          span.low_byte;
            res.layout = layout;
            return res;
        };
        std::transform(layouts.begin(), layouts.end(), tensors.begin(),
                       trans_func);
        return tensors;
    };
    auto tensors_cur = allocate(m_handle);
    //! init: copy the testcase values to the device under benchmark (only
    //! the first tensors_cur_host.size() tensors are inputs)
    for (size_t i = 0; i < tensors_cur_host.size(); ++i) {
        TensorND& tensor = tensors_cur_host[i];
        auto size = tensor.layout.span().high_byte;
        if (tensor.layout.ndim == 0)
            continue;
        megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr, tensor.raw_ptr,
                          size);
    }
    if (m_before_exec_callback) {
        m_before_exec_callback(opr, tensors_cur);
    }
    //! run
    //! warm up
    m_proxy->exec(opr, tensors_cur);
    megcoreSynchronize(m_handle->megcore_computing_handle());
    if (m_adaptive_secs) {
        //! find m_times for adaptive benchmarking: grow the batch size
        //! (doubling, capped by an extrapolation from the last batch) until
        //! the requested total duration is consumed
        m_times = 0;
        int cur_times = 1;
        auto remain_time = m_adaptive_secs * 1e6;
        while (remain_time > 0) {
            m_timer.reset();
            m_timer.start();
            for (int i = 0; i < cur_times; ++i)
                m_proxy->exec(opr, tensors_cur);
            megcoreSynchronize(m_handle->megcore_computing_handle());
            m_timer.stop();
            m_times += cur_times;
            auto this_run_time = m_timer.get_time_in_us();
            remain_time -= this_run_time;
            cur_times = std::min(
                    cur_times * 2,
                    std::max<int>(1, remain_time / this_run_time * cur_times));
        }
    }
    //! the timed runs
    m_timer.reset();
    m_timer.start();
    for (size_t t = 0; t < m_times; ++t)
        m_proxy->exec(opr, tensors_cur);
    megcoreSynchronize(m_handle->megcore_computing_handle());
    m_timer.stop();
    auto time_in_ms = m_timer.get_time_in_us() / 1e3;
    if (m_display) {
        std::cout << "Total time is " << time_in_ms << "ms "
                  << "for " << m_times << " run(s)." << std::endl;
    }
    //! only the device buffers are freed here; the host tensors belong to
    //! the caller.  NOTE(review): raw_ptr was offset by span.low_byte at
    //! allocation but is freed un-adjusted — assumes low_byte == 0
    auto free = [](Handle* handle, TensorNDArray& tensors) {
        std::for_each(tensors.begin(), tensors.end(),
                      [handle](const TensorND& tensor) {
                          megdnn_free(handle, tensor.raw_ptr);
                      });
    };
    free(m_handle, tensors_cur);
    //! in adaptive mode, report the average time per run instead of the total
    if (m_adaptive_secs)
        time_in_ms /= m_times;
    return time_in_ms;
}
  344. template <typename Opr, typename T = Timer>
  345. class Benchmarker;
  346. template <typename Opr>
  347. class Benchmarker<Opr, Timer> : public BenchmarkerBase<Opr, Timer> {
  348. public:
  349. Benchmarker(Handle* handle)
  350. : BenchmarkerBase<Opr, Timer>{handle, Timer{}} {}
  351. };
  352. ////////////////// Algo Benchmark ////////////////////////
  353. template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer>
  354. float algo_benchmark(Benchmarker<Opr, T>& benchmark, TensorLayoutArray layouts,
  355. const std::string& algo_base) {
  356. Proxy proxy;
  357. auto opr = benchmark.opr();
  358. opr->param() = benchmark.param();
  359. proxy.deduce_layout(opr, layouts);
  360. auto algos = OprAlgoProxy<Opr>::get_all_algorithms_info(opr, layouts);
  361. float min_used = std::numeric_limits<float>::max();
  362. bool execed = false;
  363. for (auto i : algos) {
  364. if (std::regex_match(i.name,
  365. std::regex("(" + algo_base + ")(.*)"))) {
  366. opr->execution_policy().algo = i;
  367. auto used = benchmark.exec(layouts);
  368. min_used = std::min(min_used, used);
  369. printf("run algo: %s used: %f ms min_used: %f ms\n", i.name.c_str(),
  370. used, min_used);
  371. execed = true;
  372. }
  373. }
  374. megdnn_assert(execed, "no algo start with %s\n", algo_base.c_str());
  375. return min_used;
  376. }
  377. template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer>
  378. float algo_benchmark(Benchmarker<Opr, T>& benchmark, TensorShapeArray shapes,
  379. const std::string& algo_base) {
  380. return algo_benchmark(benchmark, benchmark.make_layouts(shapes), algo_base);
  381. }
  382. } // namespace test
  383. } // namespace megdnn
  384. // vim: syntax=cpp.doxygen

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台