You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

benchmarker.h 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403
  1. /**
  2. * \file dnn/test/common/benchmarker.h
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  12. #pragma once
  13. #include <map>
  14. #include <memory>
  15. #include <regex>
  16. #include <vector>
  17. #include "megdnn/basic_types.h"
  18. #include "megdnn/tensor_format.h"
  19. #include "test/common/opr_algo_proxy.h"
  20. #include "test/common/opr_proxy.h"
  21. #include "test/common/rng.h"
  22. #include "test/common/timer.h"
  23. namespace megdnn {
  24. namespace test {
  25. template <typename Opr, typename T>
  26. class BenchmarkerBase {
  27. public:
  28. using Param = typename Opr::Param;
  29. using TensorValueArray = TensorNDArray;
  30. using BeforeExecCallback = std::function<void(Opr*, const TensorValueArray&)>;
  31. using TensorsConstriant = std::function<void(TensorValueArray& tensors)>;
  32. BenchmarkerBase(Handle* handle, T timer)
  33. : m_timer(timer),
  34. m_handle_naive(create_cpu_handle(2, false)),
  35. m_handle(handle),
  36. m_default_rng(new NormalRNG()),
  37. m_param(Param()),
  38. m_proxy{new OprProxy<Opr>()} {}
  39. const Handle* handle() const { return m_handle; }
  40. /*!
  41. * \brief benchmark opr on current param/dtype/rng config
  42. * \returns elapsed time in ms
  43. *
  44. * Benchmarker would construct TensorLayout vectors from shapes and
  45. * dtypes and call exec(TensorLayoutArray &).
  46. */
  47. float exec(const TensorShapeArray& shapes) { return exec(make_layouts(shapes)); }
  48. float exec(TensorLayoutArray layouts);
  49. float exect(const TensorValueArray& testcase_in);
  50. //! disabiguate overloaded exec
  51. float execs(const TensorShapeArray& shapes) { return exec(shapes); }
  52. float execl(const TensorLayoutArray& layouts) { return exec(layouts); }
  53. BenchmarkerBase& set_param(Param param) {
  54. m_param = param;
  55. return *this;
  56. }
  57. BenchmarkerBase& set_dtype(size_t idx, DType dtype) {
  58. m_dtype[idx] = dtype;
  59. return *this;
  60. }
  61. BenchmarkerBase& set_rng(size_t idx, RNG* rng) {
  62. m_rng[idx] = rng;
  63. return *this;
  64. }
  65. BenchmarkerBase& set_fmt(size_t idx, TensorFormat fmt) {
  66. m_fmt[idx] = fmt;
  67. return *this;
  68. }
  69. BenchmarkerBase& set_tensors_constraint(
  70. const TensorsConstriant& tensor_constraint) {
  71. m_tensor_constraint = tensor_constraint;
  72. return *this;
  73. }
  74. TensorLayoutArray make_layouts(const TensorShapeArray& shapes) {
  75. TensorLayoutArray layouts(shapes.size());
  76. for (size_t i = 0; i < shapes.size(); ++i) {
  77. DType dt =
  78. (m_dtype.find(i) != m_dtype.end() ? m_dtype[i] : dtype::Float32());
  79. if (m_fmt.find(i) == m_fmt.end()) {
  80. layouts[i] = TensorLayout(shapes[i], dt);
  81. layouts[i].init_contiguous_stride();
  82. } else
  83. layouts[i] = TensorLayout(shapes[i], dt, m_fmt[i]);
  84. }
  85. return layouts;
  86. }
  87. BenchmarkerBase& set_proxy(std::unique_ptr<OprProxy<Opr>>& proxy) {
  88. m_proxy.reset(nullptr);
  89. m_proxy = std::move(proxy);
  90. return *this;
  91. }
  92. std::unique_ptr<OprProxy<Opr>>& proxy() { return m_proxy; }
  93. BenchmarkerBase& set_times(size_t times) {
  94. m_times = times;
  95. return *this;
  96. }
  97. BenchmarkerBase& set_display(bool display) {
  98. m_display = display;
  99. return *this;
  100. }
  101. //! set a callback to be invoked before executing the operator
  102. BenchmarkerBase& set_before_exec_callback(const BeforeExecCallback& cb) {
  103. m_before_exec_callback = cb;
  104. return *this;
  105. }
  106. /*!
  107. * \brief set adaptive benchmarking: ignore set_times() and find
  108. * suitable times to run for given duration;
  109. *
  110. * Note: the value returned by exec() would be average time per run,
  111. * rather than total elapsed time, if this is enabled.
  112. */
  113. BenchmarkerBase& set_adaptive_benchmark(float tot_time_in_secs) {
  114. m_adaptive_secs = tot_time_in_secs;
  115. return *this;
  116. }
  117. //! get the opr impl so setting other than param() can be modified
  118. Opr* opr() {
  119. if (!m_opr) {
  120. m_opr = m_handle->create_operator<Opr>();
  121. }
  122. return m_opr.get();
  123. }
  124. const Param& param() const { return m_param; }
  125. private:
  126. T m_timer;
  127. bool m_display = true;
  128. size_t m_times = 1;
  129. float m_adaptive_secs = 0;
  130. std::unique_ptr<Handle> m_handle_naive;
  131. Handle* m_handle;
  132. std::unique_ptr<RNG> m_default_rng;
  133. std::map<size_t, RNG*> m_rng;
  134. std::map<size_t, DType> m_dtype;
  135. std::map<size_t, TensorFormat> m_fmt;
  136. Param m_param;
  137. std::unique_ptr<OprProxy<Opr>> m_proxy;
  138. BeforeExecCallback m_before_exec_callback;
  139. std::unique_ptr<Opr> m_opr;
  140. TensorsConstriant m_tensor_constraint;
  141. };
/*!
 * \brief run the operator on \p layouts and time it.
 *
 * Workflow: deduce output layouts, verify that user-provided shapes agree
 * with the deduced ones, allocate device and host tensors, fill inputs via
 * the configured RNGs, copy host data to device, do one warm-up run, then
 * time m_times executions (chosen adaptively if set_adaptive_benchmark()
 * was enabled) and return the elapsed time in milliseconds (per-run average
 * in adaptive mode, total otherwise).
 */
template <typename Opr, typename T>
float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) {
    auto opr = this->opr();
    opr->param() = m_param;
    auto user_layouts = layouts;
    m_proxy->deduce_layout(opr, layouts);
    for (size_t i = 0; i < layouts.size(); ++i) {
        if (user_layouts[i].ndim > 0) {
            // wrapped in a lambda: ASSERT_TRUE requires a void-returning
            // enclosing function
            auto run = [&]() {
                ASSERT_TRUE(layouts[i].eq_shape(user_layouts[i]))
                        << "User provided shape is "
                        << user_layouts[i].TensorShape::to_string()
                        << "\nExpected shape is "
                        << layouts[i].TensorShape::to_string();
            };
            run();
        }
    }
    // allocate one tensor per layout on the given handle; the stored pointer
    // is biased by -span.low_byte so element addressing lands inside the
    // malloc'd buffer (the bias is undone in `free` below)
    auto allocate = [&layouts](Handle* handle) {
        TensorNDArray tensors(layouts.size());
        auto trans_func = [handle](const TensorLayout& layout) {
            auto span = layout.span();
            TensorND res;
            res.reset_ptr(
                    static_cast<uint8_t*>(megdnn_malloc(handle, span.dist_byte())) -
                    span.low_byte);
            res.layout = layout;
            return res;
        };
        std::transform(layouts.begin(), layouts.end(), tensors.begin(), trans_func);
        return tensors;
    };
    auto tensors_cur = allocate(m_handle);
    auto tensors_cur_host = allocate(m_handle_naive.get());
    // init: fill host tensors using the per-index RNG (default: NormalRNG)
    for (size_t i = 0; i < tensors_cur_host.size(); ++i) {
        TensorND& tensor = tensors_cur_host[i];
        auto rng = m_rng[i];
        if (!rng)
            rng = m_default_rng.get();
        rng->gen(tensor);
    }
    if (m_tensor_constraint) {
        m_tensor_constraint(tensors_cur_host);
    }
    // upload initialized host data to the device tensors
    for (size_t i = 0; i < tensors_cur_host.size(); ++i) {
        TensorND& tensor = tensors_cur_host[i];
        if (tensor.layout.ndim == 0)
            continue;
        auto size = tensor.layout.span().high_byte;
        megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr(), tensor.raw_ptr(), size);
    }
    if (m_before_exec_callback) {
        m_before_exec_callback(opr, tensors_cur);
    }
    // run
    // warm up
    m_proxy->exec(opr, tensors_cur);
    megcoreSynchronize(m_handle->megcore_computing_handle());
    if (m_adaptive_secs) {
        // find m_times for adaptive benchmarking: keep doubling cur_times
        // (capped by the extrapolated remaining budget) until the requested
        // total duration is consumed
        m_times = 0;
        int cur_times = 1;
        auto remain_time = m_adaptive_secs * 1e6;  // microseconds
        while (remain_time > 0) {
            m_timer.reset();
            m_timer.start();
            for (int i = 0; i < cur_times; ++i)
                m_proxy->exec(opr, tensors_cur);
            megcoreSynchronize(m_handle->megcore_computing_handle());
            m_timer.stop();
            m_times += cur_times;
            auto this_run_time = m_timer.get_time_in_us();
            remain_time -= this_run_time;
            cur_times = std::min(
                    cur_times * 2,
                    std::max<int>(1, remain_time / this_run_time * cur_times));
        }
    }
    // timed runs
    m_timer.reset();
    m_timer.start();
    for (size_t t = 0; t < m_times; ++t)
        m_proxy->exec(opr, tensors_cur);
    megcoreSynchronize(m_handle->megcore_computing_handle());
    m_timer.stop();
    auto time_in_ms = m_timer.get_time_in_us() / 1e3;
    if (m_display) {
        std::cout << "Total time is " << time_in_ms << "ms "
                  << "for " << m_times << " run(s)." << std::endl;
    }
    // free device/host buffers, undoing the -low_byte pointer bias applied
    // in `allocate`
    auto free = [](Handle* handle, TensorNDArray& tensors) {
        std::for_each(tensors.begin(), tensors.end(), [handle](const TensorND& tensor) {
            megdnn_free(
                    handle, static_cast<dt_byte*>(tensor.raw_ptr()) +
                                    tensor.layout.span().low_byte);
        });
    };
    free(m_handle, tensors_cur);
    free(m_handle_naive.get(), tensors_cur_host);
    if (m_adaptive_secs)
        time_in_ms /= m_times;  // adaptive mode reports per-run average
    return time_in_ms;
}
/*!
 * \brief like exec(), but takes concrete input tensor values instead of
 *        generating them with RNGs.
 *
 * \p testcase_in supplies both the layouts and the host-side input data;
 * the data is uploaded to freshly allocated device tensors, then the same
 * warm-up / (optionally adaptive) timed-run procedure as exec() is applied.
 * Returns elapsed milliseconds (per-run average in adaptive mode).
 */
template <typename Opr, typename T>
float BenchmarkerBase<Opr, T>::exect(const TensorValueArray& testcase_in) {
    auto opr = this->opr();
    opr->param() = m_param;
    TensorLayoutArray layouts;
    TensorNDArray tensors_cur_host;
    for (auto& inp : testcase_in) {
        layouts.push_back(inp.layout);
        tensors_cur_host.emplace_back(inp);
    }
    auto user_layouts = layouts;
    m_proxy->deduce_layout(opr, layouts);
    for (size_t i = 0; i < layouts.size(); ++i)
        if (user_layouts[i].ndim > 0) {
            //! wrapped in a lambda: ASSERT_TRUE requires a void-returning
            //! enclosing function
            auto run = [&]() {
                ASSERT_TRUE(layouts[i].eq_shape(user_layouts[i]))
                        << "User provided shape is "
                        << user_layouts[i].TensorShape::to_string()
                        << "\nExpected shape is "
                        << layouts[i].TensorShape::to_string();
            };
            run();
        }
    //! allocate one tensor per layout; pointer biased by -span.low_byte
    //! (undone in `free` below)
    auto allocate = [&layouts](Handle* handle) {
        TensorNDArray tensors(layouts.size());
        auto trans_func = [handle](const TensorLayout& layout) {
            auto span = layout.span();
            TensorND res;
            res.reset_ptr(
                    static_cast<uint8_t*>(megdnn_malloc(handle, span.dist_byte())) -
                    span.low_byte);
            res.layout = layout;
            return res;
        };
        std::transform(layouts.begin(), layouts.end(), tensors.begin(), trans_func);
        return tensors;
    };
    auto tensors_cur = allocate(m_handle);
    //! init: upload the caller-provided host data to the device tensors
    for (size_t i = 0; i < tensors_cur_host.size(); ++i) {
        TensorND& tensor = tensors_cur_host[i];
        auto size = tensor.layout.span().high_byte;
        if (tensor.layout.ndim == 0)
            continue;
        megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr(), tensor.raw_ptr(), size);
    }
    if (m_before_exec_callback) {
        m_before_exec_callback(opr, tensors_cur);
    }
    //! run
    //! warm up
    m_proxy->exec(opr, tensors_cur);
    megcoreSynchronize(m_handle->megcore_computing_handle());
    if (m_adaptive_secs) {
        //! find m_times for adaptive benchmarking: keep doubling cur_times
        //! until the requested total duration is consumed
        m_times = 0;
        int cur_times = 1;
        auto remain_time = m_adaptive_secs * 1e6;  //! microseconds
        while (remain_time > 0) {
            m_timer.reset();
            m_timer.start();
            for (int i = 0; i < cur_times; ++i)
                m_proxy->exec(opr, tensors_cur);
            megcoreSynchronize(m_handle->megcore_computing_handle());
            m_timer.stop();
            m_times += cur_times;
            auto this_run_time = m_timer.get_time_in_us();
            remain_time -= this_run_time;
            cur_times = std::min(
                    cur_times * 2,
                    std::max<int>(1, remain_time / this_run_time * cur_times));
        }
    }
    //! timed runs
    m_timer.reset();
    m_timer.start();
    for (size_t t = 0; t < m_times; ++t)
        m_proxy->exec(opr, tensors_cur);
    megcoreSynchronize(m_handle->megcore_computing_handle());
    m_timer.stop();
    auto time_in_ms = m_timer.get_time_in_us() / 1e3;
    if (m_display) {
        std::cout << "Total time is " << time_in_ms << "ms "
                  << "for " << m_times << " run(s)." << std::endl;
    }
    //! free device buffers, undoing the -low_byte pointer bias
    auto free = [](Handle* handle, TensorNDArray& tensors) {
        std::for_each(tensors.begin(), tensors.end(), [handle](const TensorND& tensor) {
            megdnn_free(
                    handle, static_cast<dt_byte*>(tensor.raw_ptr()) +
                                    tensor.layout.span().low_byte);
        });
    };
    free(m_handle, tensors_cur);
    if (m_adaptive_secs)
        time_in_ms /= m_times;  //! adaptive mode reports per-run average
    return time_in_ms;
}
  341. template <typename Opr, typename T = Timer>
  342. class Benchmarker;
  343. template <typename Opr>
  344. class Benchmarker<Opr, Timer> : public BenchmarkerBase<Opr, Timer> {
  345. public:
  346. Benchmarker(Handle* handle) : BenchmarkerBase<Opr, Timer>{handle, Timer{}} {}
  347. };
  348. ////////////////// Algo Benchmark ////////////////////////
  349. template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer>
  350. float algo_benchmark(
  351. Benchmarker<Opr, T>& benchmark, TensorLayoutArray layouts,
  352. const std::string& algo_base) {
  353. Proxy proxy;
  354. auto opr = benchmark.opr();
  355. opr->param() = benchmark.param();
  356. proxy.deduce_layout(opr, layouts);
  357. auto algos = OprAlgoProxy<Opr>::get_all_algorithms_info_safe(opr, layouts);
  358. float min_used = std::numeric_limits<float>::max();
  359. bool execed = false;
  360. for (auto i : algos) {
  361. if (std::regex_match(i.desc.name, std::regex("(" + algo_base + ")(.*)"))) {
  362. opr->execution_policy().algo = i.desc;
  363. auto used = benchmark.exec(layouts);
  364. min_used = std::min(min_used, used);
  365. printf("run algo: %s used: %f ms min_used: %f ms\n", i.desc.name.c_str(),
  366. used, min_used);
  367. execed = true;
  368. }
  369. }
  370. megdnn_assert(execed, "no algo start with %s\n", algo_base.c_str());
  371. return min_used;
  372. }
  373. template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer>
  374. float algo_benchmark(
  375. Benchmarker<Opr, T>& benchmark, TensorShapeArray shapes,
  376. const std::string& algo_base) {
  377. return algo_benchmark(benchmark, benchmark.make_layouts(shapes), algo_base);
  378. }
  379. } // namespace test
  380. } // namespace megdnn
  381. // vim: syntax=cpp.doxygen