
benchmarker.h 14 kB

/**
 * \file dnn/test/common/benchmarker.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#pragma once

#include <map>
#include <memory>
#include <regex>
#include <vector>

#include "megdnn/basic_types.h"
#include "megdnn/tensor_format.h"
#include "test/common/opr_algo_proxy.h"
#include "test/common/opr_proxy.h"
#include "test/common/rng.h"
#include "test/common/timer.h"
namespace megdnn {
namespace test {

template <typename Opr, typename T>
class BenchmarkerBase {
public:
    using Param = typename Opr::Param;
    using TensorValueArray = TensorNDArray;
    using BeforeExecCallback =
            std::function<void(Opr*, const TensorValueArray&)>;
    using TensorsConstriant = std::function<void(TensorValueArray& tensors)>;

    BenchmarkerBase(Handle* handle, T timer)
            : m_timer(timer),
              m_handle_naive(create_cpu_handle(2, false)),
              m_handle(handle),
              m_default_rng(new NormalRNG()),
              m_param(Param()),
              m_proxy{new OprProxy<Opr>()} {}
    const Handle* handle() const { return m_handle; }

    /*!
     * \brief benchmark opr on current param/dtype/rng config
     * \returns elapsed time in ms
     *
     * Benchmarker constructs TensorLayout vectors from the shapes and
     * dtypes and calls exec(TensorLayoutArray&).
     */
    float exec(const TensorShapeArray& shapes) {
        return exec(make_layouts(shapes));
    }
    float exec(TensorLayoutArray layouts);

    float exect(const TensorValueArray& testcase_in);

    //! disambiguate overloaded exec
    float execs(const TensorShapeArray& shapes) { return exec(shapes); }
    float execl(const TensorLayoutArray& layouts) { return exec(layouts); }

    BenchmarkerBase& set_param(Param param) {
        m_param = param;
        return *this;
    }
    BenchmarkerBase& set_dtype(size_t idx, DType dtype) {
        m_dtype[idx] = dtype;
        return *this;
    }
    BenchmarkerBase& set_rng(size_t idx, RNG* rng) {
        m_rng[idx] = rng;
        return *this;
    }
    BenchmarkerBase& set_fmt(size_t idx, TensorFormat fmt) {
        m_fmt[idx] = fmt;
        return *this;
    }
    BenchmarkerBase& set_tensors_constraint(
            const TensorsConstriant& tensor_constraint) {
        m_tensor_constraint = tensor_constraint;
        return *this;
    }
    TensorLayoutArray make_layouts(const TensorShapeArray& shapes) {
        TensorLayoutArray layouts(shapes.size());
        for (size_t i = 0; i < shapes.size(); ++i) {
            DType dt = (m_dtype.find(i) != m_dtype.end() ? m_dtype[i]
                                                         : dtype::Float32());
            if (m_fmt.find(i) == m_fmt.end()) {
                layouts[i] = TensorLayout(shapes[i], dt);
                layouts[i].init_contiguous_stride();
            } else
                layouts[i] = TensorLayout(shapes[i], dt, m_fmt[i]);
        }
        return layouts;
    }
    BenchmarkerBase& set_proxy(std::unique_ptr<OprProxy<Opr>>& proxy) {
        m_proxy.reset(nullptr);
        m_proxy = std::move(proxy);
        return *this;
    }
    std::unique_ptr<OprProxy<Opr>>& proxy() { return m_proxy; }
    BenchmarkerBase& set_times(size_t times) {
        m_times = times;
        return *this;
    }
    BenchmarkerBase& set_display(bool display) {
        m_display = display;
        return *this;
    }
    //! set a callback to be invoked before executing the operator
    BenchmarkerBase& set_before_exec_callback(const BeforeExecCallback& cb) {
        m_before_exec_callback = cb;
        return *this;
    }
    /*!
     * \brief set adaptive benchmarking: ignore set_times() and find a
     * suitable number of runs for the given total duration
     *
     * Note: if this is enabled, the value returned by exec() would be the
     * average time per run rather than the total elapsed time.
     */
    BenchmarkerBase& set_adaptive_benchmark(float tot_time_in_secs) {
        m_adaptive_secs = tot_time_in_secs;
        return *this;
    }
    //! get the opr impl so that settings other than param() can be modified
    Opr* opr() {
        if (!m_opr) {
            m_opr = m_handle->create_operator<Opr>();
        }
        return m_opr.get();
    }
    const Param& param() const { return m_param; }

private:
    T m_timer;
    bool m_display = true;
    size_t m_times = 1;
    float m_adaptive_secs = 0;
    std::unique_ptr<Handle> m_handle_naive;
    Handle* m_handle;
    std::unique_ptr<RNG> m_default_rng;
    std::map<size_t, RNG*> m_rng;
    std::map<size_t, DType> m_dtype;
    std::map<size_t, TensorFormat> m_fmt;
    Param m_param;
    std::unique_ptr<OprProxy<Opr>> m_proxy;
    BeforeExecCallback m_before_exec_callback;
    std::unique_ptr<Opr> m_opr;
    TensorsConstriant m_tensor_constraint;
};
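
//! exec() flow: deduce output layouts via the proxy, allocate device tensors
//! on m_handle and host tensors on the naive CPU handle, fill host tensors
//! with the configured RNGs (NormalRNG by default), copy them to the device,
//! warm up once, then time m_times executions with device synchronization.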
template <typename Opr, typename T>
float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) {
    auto opr = this->opr();
    opr->param() = m_param;
    auto user_layouts = layouts;
    m_proxy->deduce_layout(opr, layouts);
    for (size_t i = 0; i < layouts.size(); ++i)
        if (user_layouts[i].ndim > 0) {
            auto run = [&]() {
                ASSERT_TRUE(layouts[i].eq_shape(user_layouts[i]))
                        << "User provided shape is "
                        << user_layouts[i].TensorShape::to_string()
                        << "\nExpected shape is "
                        << layouts[i].TensorShape::to_string();
            };
            run();
        }
    auto allocate = [&layouts](Handle* handle) {
        TensorNDArray tensors(layouts.size());
        auto trans_func = [handle](const TensorLayout& layout) {
            auto span = layout.span();
            TensorND res;
            res.raw_ptr = static_cast<uint8_t*>(
                                  megdnn_malloc(handle, span.dist_byte())) +
                          span.low_byte;
            res.layout = layout;
            return res;
        };
        std::transform(layouts.begin(), layouts.end(), tensors.begin(),
                       trans_func);
        return tensors;
    };
    auto tensors_cur = allocate(m_handle);
    auto tensors_cur_host = allocate(m_handle_naive.get());
    // init
    for (size_t i = 0; i < tensors_cur_host.size(); ++i) {
        TensorND& tensor = tensors_cur_host[i];
        auto rng = m_rng[i];
        if (!rng)
            rng = m_default_rng.get();
        rng->gen(tensor);
    }
    if (m_tensor_constraint) {
        m_tensor_constraint(tensors_cur_host);
    }
    for (size_t i = 0; i < tensors_cur_host.size(); ++i) {
        TensorND& tensor = tensors_cur_host[i];
        if (tensor.layout.ndim == 0)
            continue;
        auto size = tensor.layout.span().high_byte;
        megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr, tensor.raw_ptr,
                          size);
    }
    if (m_before_exec_callback) {
        m_before_exec_callback(opr, tensors_cur);
    }
    // run
    // warm up
    m_proxy->exec(opr, tensors_cur);
    megcoreSynchronize(m_handle->megcore_computing_handle());
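    // Adaptive mode: grow cur_times geometrically (doubling, capped by what
    // the remaining time budget is predicted to allow), accumulating the
    // total into m_times, which the final timed loop below then uses.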
    if (m_adaptive_secs) {
        // find m_times for adaptive benchmarking
        m_times = 0;
        int cur_times = 1;
        auto remain_time = m_adaptive_secs * 1e6;
        while (remain_time > 0) {
            m_timer.reset();
            m_timer.start();
            for (int i = 0; i < cur_times; ++i)
                m_proxy->exec(opr, tensors_cur);
            megcoreSynchronize(m_handle->megcore_computing_handle());
            m_timer.stop();
            m_times += cur_times;
            auto this_run_time = m_timer.get_time_in_us();
            remain_time -= this_run_time;
            cur_times = std::min(
                    cur_times * 2,
                    std::max<int>(1, remain_time / this_run_time * cur_times));
        }
    }
    m_timer.reset();
    m_timer.start();
    for (size_t t = 0; t < m_times; ++t)
        m_proxy->exec(opr, tensors_cur);
    megcoreSynchronize(m_handle->megcore_computing_handle());
    m_timer.stop();
    auto time_in_ms = m_timer.get_time_in_us() / 1e3;
    if (m_display) {
        std::cout << "Total time is " << time_in_ms << "ms "
                  << "for " << m_times << " run(s)." << std::endl;
    }
    auto free = [](Handle* handle, TensorNDArray& tensors) {
        std::for_each(tensors.begin(), tensors.end(),
                      [handle](const TensorND& tensor) {
                          megdnn_free(handle, tensor.raw_ptr);
                      });
    };
    free(m_handle, tensors_cur);
    free(m_handle_naive.get(), tensors_cur_host);
    if (m_adaptive_secs)
        time_in_ms /= m_times;
    return time_in_ms;
}
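
//! exect() mirrors exec(), but takes caller-provided host tensors whose
//! values are copied to the device as-is, instead of generating input data
//! with the configured RNGs.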
template <typename Opr, typename T>
float BenchmarkerBase<Opr, T>::exect(const TensorValueArray& testcase_in) {
    auto opr = this->opr();
    opr->param() = m_param;
    TensorLayoutArray layouts;
    TensorNDArray tensors_cur_host;
    for (auto& inp : testcase_in) {
        layouts.push_back(inp.layout);
        tensors_cur_host.emplace_back(inp);
    }
    auto user_layouts = layouts;
    m_proxy->deduce_layout(opr, layouts);
    for (size_t i = 0; i < layouts.size(); ++i)
        if (user_layouts[i].ndim > 0) {
            auto run = [&]() {
                ASSERT_TRUE(layouts[i].eq_shape(user_layouts[i]))
                        << "User provided shape is "
                        << user_layouts[i].TensorShape::to_string()
                        << "\nExpected shape is "
                        << layouts[i].TensorShape::to_string();
            };
            run();
        }
    auto allocate = [&layouts](Handle* handle) {
        TensorNDArray tensors(layouts.size());
        auto trans_func = [handle](const TensorLayout& layout) {
            auto span = layout.span();
            TensorND res;
            res.raw_ptr = static_cast<uint8_t*>(
                                  megdnn_malloc(handle, span.dist_byte())) +
                          span.low_byte;
            res.layout = layout;
            return res;
        };
        std::transform(layouts.begin(), layouts.end(), tensors.begin(),
                       trans_func);
        return tensors;
    };
    auto tensors_cur = allocate(m_handle);
    //! init
    for (size_t i = 0; i < tensors_cur_host.size(); ++i) {
        TensorND& tensor = tensors_cur_host[i];
        auto size = tensor.layout.span().high_byte;
        if (tensor.layout.ndim == 0)
            continue;
        megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr, tensor.raw_ptr,
                          size);
    }
    if (m_before_exec_callback) {
        m_before_exec_callback(opr, tensors_cur);
    }
    //! run
    //! warm up
    m_proxy->exec(opr, tensors_cur);
    megcoreSynchronize(m_handle->megcore_computing_handle());
    if (m_adaptive_secs) {
        //! find m_times for adaptive benchmarking
        m_times = 0;
        int cur_times = 1;
        auto remain_time = m_adaptive_secs * 1e6;
        while (remain_time > 0) {
            m_timer.reset();
            m_timer.start();
            for (int i = 0; i < cur_times; ++i)
                m_proxy->exec(opr, tensors_cur);
            megcoreSynchronize(m_handle->megcore_computing_handle());
            m_timer.stop();
            m_times += cur_times;
            auto this_run_time = m_timer.get_time_in_us();
            remain_time -= this_run_time;
            cur_times = std::min(
                    cur_times * 2,
                    std::max<int>(1, remain_time / this_run_time * cur_times));
        }
    }
    m_timer.reset();
    m_timer.start();
    for (size_t t = 0; t < m_times; ++t)
        m_proxy->exec(opr, tensors_cur);
    megcoreSynchronize(m_handle->megcore_computing_handle());
    m_timer.stop();
    auto time_in_ms = m_timer.get_time_in_us() / 1e3;
    if (m_display) {
        std::cout << "Total time is " << time_in_ms << "ms "
                  << "for " << m_times << " run(s)." << std::endl;
    }
    auto free = [](Handle* handle, TensorNDArray& tensors) {
        std::for_each(tensors.begin(), tensors.end(),
                      [handle](const TensorND& tensor) {
                          megdnn_free(handle, tensor.raw_ptr);
                      });
    };
    free(m_handle, tensors_cur);
    if (m_adaptive_secs)
        time_in_ms /= m_times;
    return time_in_ms;
}
template <typename Opr, typename T = Timer>
class Benchmarker;

template <typename Opr>
class Benchmarker<Opr, Timer> : public BenchmarkerBase<Opr, Timer> {
public:
    Benchmarker(Handle* handle)
            : BenchmarkerBase<Opr, Timer>{handle, Timer{}} {}
};
////////////////// Algo Benchmark ////////////////////////
template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer>
float algo_benchmark(Benchmarker<Opr, T>& benchmark, TensorLayoutArray layouts,
                     const std::string& algo_base) {
    Proxy proxy;
    auto opr = benchmark.opr();
    opr->param() = benchmark.param();
    proxy.deduce_layout(opr, layouts);
    auto algos = OprAlgoProxy<Opr>::get_all_algorithms_info_safe(opr, layouts);
    float min_used = std::numeric_limits<float>::max();
    bool execed = false;
    for (auto i : algos) {
        if (std::regex_match(i.desc.name,
                             std::regex("(" + algo_base + ")(.*)"))) {
            opr->execution_policy().algo = i.desc;
            auto used = benchmark.exec(layouts);
            min_used = std::min(min_used, used);
            printf("run algo: %s used: %f ms min_used: %f ms\n",
                   i.desc.name.c_str(), used, min_used);
            execed = true;
        }
    }
    megdnn_assert(execed, "no algo start with %s\n", algo_base.c_str());
    return min_used;
}
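
//! Note: the regex "(algo_base)(.*)" matches any algorithm whose name starts
//! with algo_base, so passing a name prefix benchmarks every matching
//! algorithm in turn and returns the fastest time.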
template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer>
float algo_benchmark(Benchmarker<Opr, T>& benchmark, TensorShapeArray shapes,
                     const std::string& algo_base) {
    return algo_benchmark(benchmark, benchmark.make_layouts(shapes), algo_base);
}
} // namespace test
} // namespace megdnn
// vim: syntax=cpp.doxygen
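
For orientation, here is a minimal usage sketch of the fluent API above. It is not taken from this file: it assumes a valid megdnn Handle and the ElemwiseForward operator, and the shapes, dtypes, and one-second adaptive budget are illustrative.

#include <cstdio>
#include "test/common/benchmarker.h"

using namespace megdnn;
using namespace test;

// Hypothetical driver: benchmark an elementwise ADD on the given handle.
void bench_elemwise_add(Handle* handle) {
    Benchmarker<ElemwiseForward> bench(handle);
    ElemwiseForward::Param param;
    param.mode = ElemwiseForward::Param::Mode::ADD;
    // Adaptive mode: run for roughly 1s in total; execs() then returns the
    // average time per run in milliseconds. The empty trailing shape lets
    // the proxy deduce the output layout.
    float avg_ms = bench.set_param(param)
                           .set_dtype(0, dtype::Float32())
                           .set_dtype(1, dtype::Float32())
                           .set_adaptive_benchmark(1.0f)
                           .execs({{32, 64, 56, 56}, {32, 64, 56, 56}, {}});
    printf("elemwise ADD: %.3f ms/run\n", avg_ms);
}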
