You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

benchmarker.h 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398
  1. /**
  2. * \file dnn/test/common/benchmarker.h
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #pragma once
  13. #include <map>
  14. #include <memory>
  15. #include <regex>
  16. #include <vector>
  17. #include "megdnn/basic_types.h"
  18. #include "megdnn/tensor_format.h"
  19. #include "test/common/opr_algo_proxy.h"
  20. #include "test/common/opr_proxy.h"
  21. #include "test/common/rng.h"
  22. #include "test/common/timer.h"
  23. namespace megdnn {
  24. namespace test {
  25. template <typename Opr, typename T>
  26. class BenchmarkerBase {
  27. public:
  28. using Param = typename Opr::Param;
  29. using TensorValueArray = TensorNDArray;
  30. using BeforeExecCallback = std::function<void(Opr*, const TensorValueArray&)>;
  31. using TensorsConstriant = std::function<void(TensorValueArray& tensors)>;
  32. BenchmarkerBase(Handle* handle, T timer)
  33. : m_timer(timer),
  34. m_handle_naive(create_cpu_handle(2, false)),
  35. m_handle(handle),
  36. m_default_rng(new NormalRNG()),
  37. m_param(Param()),
  38. m_proxy{new OprProxy<Opr>()} {}
  39. const Handle* handle() const { return m_handle; }
  40. /*!
  41. * \brief benchmark opr on current param/dtype/rng config
  42. * \returns elapsed time in ms
  43. *
  44. * Benchmarker would construct TensorLayout vectors from shapes and
  45. * dtypes and call exec(TensorLayoutArray &).
  46. */
  47. float exec(const TensorShapeArray& shapes) { return exec(make_layouts(shapes)); }
  48. float exec(TensorLayoutArray layouts);
  49. float exect(const TensorValueArray& testcase_in);
  50. //! disabiguate overloaded exec
  51. float execs(const TensorShapeArray& shapes) { return exec(shapes); }
  52. float execl(const TensorLayoutArray& layouts) { return exec(layouts); }
  53. BenchmarkerBase& set_param(Param param) {
  54. m_param = param;
  55. return *this;
  56. }
  57. BenchmarkerBase& set_dtype(size_t idx, DType dtype) {
  58. m_dtype[idx] = dtype;
  59. return *this;
  60. }
  61. BenchmarkerBase& set_rng(size_t idx, RNG* rng) {
  62. m_rng[idx] = rng;
  63. return *this;
  64. }
  65. BenchmarkerBase& set_fmt(size_t idx, TensorFormat fmt) {
  66. m_fmt[idx] = fmt;
  67. return *this;
  68. }
  69. BenchmarkerBase& set_tensors_constraint(
  70. const TensorsConstriant& tensor_constraint) {
  71. m_tensor_constraint = tensor_constraint;
  72. return *this;
  73. }
  74. TensorLayoutArray make_layouts(const TensorShapeArray& shapes) {
  75. TensorLayoutArray layouts(shapes.size());
  76. for (size_t i = 0; i < shapes.size(); ++i) {
  77. DType dt =
  78. (m_dtype.find(i) != m_dtype.end() ? m_dtype[i] : dtype::Float32());
  79. if (m_fmt.find(i) == m_fmt.end()) {
  80. layouts[i] = TensorLayout(shapes[i], dt);
  81. layouts[i].init_contiguous_stride();
  82. } else
  83. layouts[i] = TensorLayout(shapes[i], dt, m_fmt[i]);
  84. }
  85. return layouts;
  86. }
  87. BenchmarkerBase& set_proxy(std::unique_ptr<OprProxy<Opr>>& proxy) {
  88. m_proxy.reset(nullptr);
  89. m_proxy = std::move(proxy);
  90. return *this;
  91. }
  92. std::unique_ptr<OprProxy<Opr>>& proxy() { return m_proxy; }
  93. BenchmarkerBase& set_times(size_t times) {
  94. m_times = times;
  95. return *this;
  96. }
  97. BenchmarkerBase& set_display(bool display) {
  98. m_display = display;
  99. return *this;
  100. }
  101. //! set a callback to be invoked before executing the operator
  102. BenchmarkerBase& set_before_exec_callback(const BeforeExecCallback& cb) {
  103. m_before_exec_callback = cb;
  104. return *this;
  105. }
  106. /*!
  107. * \brief set adaptive benchmarking: ignore set_times() and find
  108. * suitable times to run for given duration;
  109. *
  110. * Note: the value returned by exec() would be average time per run,
  111. * rather than total elapsed time, if this is enabled.
  112. */
  113. BenchmarkerBase& set_adaptive_benchmark(float tot_time_in_secs) {
  114. m_adaptive_secs = tot_time_in_secs;
  115. return *this;
  116. }
  117. //! get the opr impl so setting other than param() can be modified
  118. Opr* opr() {
  119. if (!m_opr) {
  120. m_opr = m_handle->create_operator<Opr>();
  121. }
  122. return m_opr.get();
  123. }
  124. const Param& param() const { return m_param; }
  125. private:
  126. T m_timer;
  127. bool m_display = true;
  128. size_t m_times = 1;
  129. float m_adaptive_secs = 0;
  130. std::unique_ptr<Handle> m_handle_naive;
  131. Handle* m_handle;
  132. std::unique_ptr<RNG> m_default_rng;
  133. std::map<size_t, RNG*> m_rng;
  134. std::map<size_t, DType> m_dtype;
  135. std::map<size_t, TensorFormat> m_fmt;
  136. Param m_param;
  137. std::unique_ptr<OprProxy<Opr>> m_proxy;
  138. BeforeExecCallback m_before_exec_callback;
  139. std::unique_ptr<Opr> m_opr;
  140. TensorsConstriant m_tensor_constraint;
  141. };
  142. template <typename Opr, typename T>
  143. float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) {
  144. auto opr = this->opr();
  145. opr->param() = m_param;
  146. auto user_layouts = layouts;
  147. m_proxy->deduce_layout(opr, layouts);
  148. for (size_t i = 0; i < layouts.size(); ++i)
  149. if (user_layouts[i].ndim > 0) {
  150. auto run = [&]() {
  151. ASSERT_TRUE(layouts[i].eq_shape(user_layouts[i]))
  152. << "User provided shape is "
  153. << user_layouts[i].TensorShape::to_string()
  154. << "\nExpected shape is "
  155. << layouts[i].TensorShape::to_string();
  156. };
  157. run();
  158. }
  159. auto allocate = [&layouts](Handle* handle) {
  160. TensorNDArray tensors(layouts.size());
  161. auto trans_func = [handle](const TensorLayout& layout) {
  162. auto span = layout.span();
  163. TensorND res;
  164. res.reset_ptr(
  165. static_cast<uint8_t*>(megdnn_malloc(handle, span.dist_byte())) +
  166. span.low_byte);
  167. res.layout = layout;
  168. return res;
  169. };
  170. std::transform(layouts.begin(), layouts.end(), tensors.begin(), trans_func);
  171. return tensors;
  172. };
  173. auto tensors_cur = allocate(m_handle);
  174. auto tensors_cur_host = allocate(m_handle_naive.get());
  175. // init
  176. for (size_t i = 0; i < tensors_cur_host.size(); ++i) {
  177. TensorND& tensor = tensors_cur_host[i];
  178. auto rng = m_rng[i];
  179. if (!rng)
  180. rng = m_default_rng.get();
  181. rng->gen(tensor);
  182. }
  183. if (m_tensor_constraint) {
  184. m_tensor_constraint(tensors_cur_host);
  185. }
  186. for (size_t i = 0; i < tensors_cur_host.size(); ++i) {
  187. TensorND& tensor = tensors_cur_host[i];
  188. if (tensor.layout.ndim == 0)
  189. continue;
  190. auto size = tensor.layout.span().high_byte;
  191. megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr(), tensor.raw_ptr(), size);
  192. }
  193. if (m_before_exec_callback) {
  194. m_before_exec_callback(opr, tensors_cur);
  195. }
  196. // run
  197. // warm up
  198. m_proxy->exec(opr, tensors_cur);
  199. megcoreSynchronize(m_handle->megcore_computing_handle());
  200. if (m_adaptive_secs) {
  201. // find m_times for adaptive benchmarking
  202. m_times = 0;
  203. int cur_times = 1;
  204. auto remain_time = m_adaptive_secs * 1e6;
  205. while (remain_time > 0) {
  206. m_timer.reset();
  207. m_timer.start();
  208. for (int i = 0; i < cur_times; ++i)
  209. m_proxy->exec(opr, tensors_cur);
  210. megcoreSynchronize(m_handle->megcore_computing_handle());
  211. m_timer.stop();
  212. m_times += cur_times;
  213. auto this_run_time = m_timer.get_time_in_us();
  214. remain_time -= this_run_time;
  215. cur_times = std::min(
  216. cur_times * 2,
  217. std::max<int>(1, remain_time / this_run_time * cur_times));
  218. }
  219. }
  220. m_timer.reset();
  221. m_timer.start();
  222. for (size_t t = 0; t < m_times; ++t)
  223. m_proxy->exec(opr, tensors_cur);
  224. megcoreSynchronize(m_handle->megcore_computing_handle());
  225. m_timer.stop();
  226. auto time_in_ms = m_timer.get_time_in_us() / 1e3;
  227. if (m_display) {
  228. std::cout << "Total time is " << time_in_ms << "ms "
  229. << "for " << m_times << " run(s)." << std::endl;
  230. }
  231. auto free = [](Handle* handle, TensorNDArray& tensors) {
  232. std::for_each(tensors.begin(), tensors.end(), [handle](const TensorND& tensor) {
  233. megdnn_free(handle, tensor.raw_ptr());
  234. });
  235. };
  236. free(m_handle, tensors_cur);
  237. free(m_handle_naive.get(), tensors_cur_host);
  238. if (m_adaptive_secs)
  239. time_in_ms /= m_times;
  240. return time_in_ms;
  241. }
  242. template <typename Opr, typename T>
  243. float BenchmarkerBase<Opr, T>::exect(const TensorValueArray& testcase_in) {
  244. auto opr = this->opr();
  245. opr->param() = m_param;
  246. TensorLayoutArray layouts;
  247. TensorNDArray tensors_cur_host;
  248. for (auto& inp : testcase_in) {
  249. layouts.push_back(inp.layout);
  250. tensors_cur_host.emplace_back(inp);
  251. }
  252. auto user_layouts = layouts;
  253. m_proxy->deduce_layout(opr, layouts);
  254. for (size_t i = 0; i < layouts.size(); ++i)
  255. if (user_layouts[i].ndim > 0) {
  256. auto run = [&]() {
  257. ASSERT_TRUE(layouts[i].eq_shape(user_layouts[i]))
  258. << "User provided shape is "
  259. << user_layouts[i].TensorShape::to_string()
  260. << "\nExpected shape is "
  261. << layouts[i].TensorShape::to_string();
  262. };
  263. run();
  264. }
  265. auto allocate = [&layouts](Handle* handle) {
  266. TensorNDArray tensors(layouts.size());
  267. auto trans_func = [handle](const TensorLayout& layout) {
  268. auto span = layout.span();
  269. TensorND res;
  270. res.reset_ptr(
  271. static_cast<uint8_t*>(megdnn_malloc(handle, span.dist_byte())) +
  272. span.low_byte);
  273. res.layout = layout;
  274. return res;
  275. };
  276. std::transform(layouts.begin(), layouts.end(), tensors.begin(), trans_func);
  277. return tensors;
  278. };
  279. auto tensors_cur = allocate(m_handle);
  280. //! init
  281. for (size_t i = 0; i < tensors_cur_host.size(); ++i) {
  282. TensorND& tensor = tensors_cur_host[i];
  283. auto size = tensor.layout.span().high_byte;
  284. if (tensor.layout.ndim == 0)
  285. continue;
  286. megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr(), tensor.raw_ptr(), size);
  287. }
  288. if (m_before_exec_callback) {
  289. m_before_exec_callback(opr, tensors_cur);
  290. }
  291. //! run
  292. //! warm up
  293. m_proxy->exec(opr, tensors_cur);
  294. megcoreSynchronize(m_handle->megcore_computing_handle());
  295. if (m_adaptive_secs) {
  296. //! find m_times for adaptive benchmarking
  297. m_times = 0;
  298. int cur_times = 1;
  299. auto remain_time = m_adaptive_secs * 1e6;
  300. while (remain_time > 0) {
  301. m_timer.reset();
  302. m_timer.start();
  303. for (int i = 0; i < cur_times; ++i)
  304. m_proxy->exec(opr, tensors_cur);
  305. megcoreSynchronize(m_handle->megcore_computing_handle());
  306. m_timer.stop();
  307. m_times += cur_times;
  308. auto this_run_time = m_timer.get_time_in_us();
  309. remain_time -= this_run_time;
  310. cur_times = std::min(
  311. cur_times * 2,
  312. std::max<int>(1, remain_time / this_run_time * cur_times));
  313. }
  314. }
  315. m_timer.reset();
  316. m_timer.start();
  317. for (size_t t = 0; t < m_times; ++t)
  318. m_proxy->exec(opr, tensors_cur);
  319. megcoreSynchronize(m_handle->megcore_computing_handle());
  320. m_timer.stop();
  321. auto time_in_ms = m_timer.get_time_in_us() / 1e3;
  322. if (m_display) {
  323. std::cout << "Total time is " << time_in_ms << "ms "
  324. << "for " << m_times << " run(s)." << std::endl;
  325. }
  326. auto free = [](Handle* handle, TensorNDArray& tensors) {
  327. std::for_each(tensors.begin(), tensors.end(), [handle](const TensorND& tensor) {
  328. megdnn_free(handle, tensor.raw_ptr());
  329. });
  330. };
  331. free(m_handle, tensors_cur);
  332. if (m_adaptive_secs)
  333. time_in_ms /= m_times;
  334. return time_in_ms;
  335. }
//! user-facing benchmarker; only the wall-clock Timer specialization exists
template <typename Opr, typename T = Timer>
class Benchmarker;

//! default specialization: BenchmarkerBase timed by test::Timer
template <typename Opr>
class Benchmarker<Opr, Timer> : public BenchmarkerBase<Opr, Timer> {
public:
    Benchmarker(Handle* handle) : BenchmarkerBase<Opr, Timer>{handle, Timer{}} {}
};
  343. ////////////////// Algo Benchmark ////////////////////////
  344. template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer>
  345. float algo_benchmark(
  346. Benchmarker<Opr, T>& benchmark, TensorLayoutArray layouts,
  347. const std::string& algo_base) {
  348. Proxy proxy;
  349. auto opr = benchmark.opr();
  350. opr->param() = benchmark.param();
  351. proxy.deduce_layout(opr, layouts);
  352. auto algos = OprAlgoProxy<Opr>::get_all_algorithms_info_safe(opr, layouts);
  353. float min_used = std::numeric_limits<float>::max();
  354. bool execed = false;
  355. for (auto i : algos) {
  356. if (std::regex_match(i.desc.name, std::regex("(" + algo_base + ")(.*)"))) {
  357. opr->execution_policy().algo = i.desc;
  358. auto used = benchmark.exec(layouts);
  359. min_used = std::min(min_used, used);
  360. printf("run algo: %s used: %f ms min_used: %f ms\n", i.desc.name.c_str(),
  361. used, min_used);
  362. execed = true;
  363. }
  364. }
  365. megdnn_assert(execed, "no algo start with %s\n", algo_base.c_str());
  366. return min_used;
  367. }
//! convenience overload: build layouts from \p shapes using the
//! benchmarker's dtype/format settings, then forward to the layout overload
template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer>
float algo_benchmark(
        Benchmarker<Opr, T>& benchmark, TensorShapeArray shapes,
        const std::string& algo_base) {
    return algo_benchmark(benchmark, benchmark.make_layouts(shapes), algo_base);
}
  374. } // namespace test
  375. } // namespace megdnn
  376. // vim: syntax=cpp.doxygen