You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; may include dashes ('-'); and can be up to 35 characters long.

benchmarker.h 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393
  1. #pragma once
  2. #include <map>
  3. #include <memory>
  4. #include <regex>
  5. #include <vector>
  6. #include "megdnn/basic_types.h"
  7. #include "megdnn/tensor_format.h"
  8. #include "test/common/opr_algo_proxy.h"
  9. #include "test/common/opr_proxy.h"
  10. #include "test/common/rng.h"
  11. #include "test/common/timer.h"
  12. namespace megdnn {
  13. namespace test {
/*!
 * \brief base helper for benchmarking a single megdnn operator
 *
 * Usage: configure via the fluent set_*() methods, then call exec()/exect(),
 * which return elapsed time in milliseconds (or average time per run when
 * adaptive benchmarking is enabled).
 *
 * \tparam Opr   megdnn operator type to benchmark
 * \tparam T     timer type (see Timer in test/common/timer.h)
 * \tparam Proxy proxy used to deduce layouts and execute the operator
 */
template <typename Opr, typename T, typename Proxy = OprProxy<Opr>>
class BenchmarkerBase {
public:
    using Param = typename Opr::Param;
    using TensorValueArray = TensorNDArray;
    //! invoked with the operator and the device tensors right before execution
    using BeforeExecCallback = std::function<void(Opr*, const TensorValueArray&)>;
    //! post-processes host tensors after RNG initialization (e.g. to enforce
    //! value constraints); note the "Constriant" spelling is part of the
    //! public interface and therefore kept
    using TensorsConstriant = std::function<void(TensorValueArray& tensors)>;

    BenchmarkerBase(Handle* handle, T timer)
            : m_timer(timer),
              m_handle_naive(create_cpu_handle(2, false)),
              m_handle(handle),
              m_default_rng(new NormalRNG()),
              m_param(Param()),
              m_proxy{new Proxy()} {}

    const Handle* handle() const { return m_handle; }

    /*!
     * \brief benchmark opr on current param/dtype/rng config
     * \returns elapsed time in ms
     *
     * Benchmarker would construct TensorLayout vectors from shapes and
     * dtypes and call exec(TensorLayoutArray &).
     */
    float exec(const TensorShapeArray& shapes) { return exec(make_layouts(shapes)); }
    float exec(TensorLayoutArray layouts);

    //! benchmark with caller-provided host tensor values instead of RNG data
    float exect(const TensorValueArray& testcase_in);

    //! disambiguate overloaded exec
    float execs(const TensorShapeArray& shapes) { return exec(shapes); }
    float execl(const TensorLayoutArray& layouts) { return exec(layouts); }

    BenchmarkerBase& set_param(Param param) {
        m_param = param;
        return *this;
    }
    //! set dtype of the idx-th tensor (default: Float32, see make_layouts)
    BenchmarkerBase& set_dtype(size_t idx, DType dtype) {
        m_dtype[idx] = dtype;
        return *this;
    }
    //! set RNG of the idx-th tensor; unset indices use the default NormalRNG
    BenchmarkerBase& set_rng(size_t idx, RNG* rng) {
        m_rng[idx] = rng;
        return *this;
    }
    //! set tensor format of the idx-th tensor (default: contiguous strides)
    BenchmarkerBase& set_fmt(size_t idx, TensorFormat fmt) {
        m_fmt[idx] = fmt;
        return *this;
    }
    BenchmarkerBase& set_tensors_constraint(
            const TensorsConstriant& tensor_constraint) {
        m_tensor_constraint = tensor_constraint;
        return *this;
    }

    //! build layouts from shapes using the per-index dtype/format settings
    TensorLayoutArray make_layouts(const TensorShapeArray& shapes) {
        TensorLayoutArray layouts(shapes.size());
        for (size_t i = 0; i < shapes.size(); ++i) {
            DType dt =
                    (m_dtype.find(i) != m_dtype.end() ? m_dtype[i] : dtype::Float32());
            if (m_fmt.find(i) == m_fmt.end()) {
                layouts[i] = TensorLayout(shapes[i], dt);
                layouts[i].init_contiguous_stride();
            } else
                layouts[i] = TensorLayout(shapes[i], dt, m_fmt[i]);
        }
        return layouts;
    }

    //! take ownership of a custom proxy; the argument is moved-from afterwards
    BenchmarkerBase& set_proxy(std::unique_ptr<Proxy>& proxy) {
        m_proxy.reset(nullptr);
        m_proxy = std::move(proxy);
        return *this;
    }
    std::unique_ptr<Proxy>& proxy() { return m_proxy; }

    //! number of timed runs per exec()/exect() (ignored under adaptive mode)
    BenchmarkerBase& set_times(size_t times) {
        m_times = times;
        return *this;
    }
    //! whether to print the total time to stdout after each benchmark
    BenchmarkerBase& set_display(bool display) {
        m_display = display;
        return *this;
    }
    //! set a callback to be invoked before executing the operator
    BenchmarkerBase& set_before_exec_callback(const BeforeExecCallback& cb) {
        m_before_exec_callback = cb;
        return *this;
    }

    /*!
     * \brief set adaptive benchmarking: ignore set_times() and find
     * suitable times to run for given duration;
     *
     * Note: the value returned by exec() would be average time per run,
     * rather than total elapsed time, if this is enabled.
     */
    BenchmarkerBase& set_adaptive_benchmark(float tot_time_in_secs) {
        m_adaptive_secs = tot_time_in_secs;
        return *this;
    }

    //! get the opr impl so setting other than param() can be modified
    Opr* opr() {
        if (!m_opr) {
            m_opr = m_handle->create_operator<Opr>();
        }
        return m_opr.get();
    }
    const Param& param() const { return m_param; }

private:
    T m_timer;                  //!< timer used around the exec loops
    bool m_display = true;      //!< print timing summary when true
    size_t m_times = 1;         //!< number of timed runs
    float m_adaptive_secs = 0;  //!< > 0 enables adaptive benchmarking
    std::unique_ptr<Handle> m_handle_naive;  //!< CPU handle for host staging
    Handle* m_handle;                        //!< device handle under benchmark
    std::unique_ptr<RNG> m_default_rng;      //!< fallback RNG (NormalRNG)
    std::map<size_t, RNG*> m_rng;            //!< per-tensor RNG overrides
    std::map<size_t, DType> m_dtype;         //!< per-tensor dtype overrides
    std::map<size_t, TensorFormat> m_fmt;    //!< per-tensor format overrides
    Param m_param;
    std::unique_ptr<Proxy> m_proxy;
    BeforeExecCallback m_before_exec_callback;
    std::unique_ptr<Opr> m_opr;  //!< lazily created by opr()
    TensorsConstriant m_tensor_constraint;
};
/*!
 * \brief allocate, initialize, warm up and time m_times executions of the
 *        operator on the given layouts
 * \returns total elapsed time in ms, or average per-run time when adaptive
 *          benchmarking is enabled
 */
template <typename Opr, typename T, typename OprProxy>
float BenchmarkerBase<Opr, T, OprProxy>::exec(TensorLayoutArray layouts) {
    auto opr = this->opr();
    opr->param() = m_param;
    // keep a copy of the caller-provided layouts so any shape the user
    // explicitly supplied can be cross-checked against the deduced one
    auto user_layouts = layouts;
    m_proxy->deduce_layout(opr, layouts);
    for (size_t i = 0; i < layouts.size(); ++i) {
        if (user_layouts[i].ndim > 0) {
            // wrapped in a lambda: ASSERT_TRUE must appear in a
            // void-returning scope, while exec() returns float
            auto run = [&]() {
                ASSERT_TRUE(layouts[i].eq_shape(user_layouts[i]))
                        << "User provided shape is "
                        << user_layouts[i].TensorShape::to_string()
                        << "\nExpected shape is "
                        << layouts[i].TensorShape::to_string();
            };
            run();
        }
    }
    // allocate one tensor per layout on the given handle; the stored pointer
    // is biased by -span.low_byte so element addressing works for layouts
    // whose span begins at a negative byte offset
    auto allocate = [&layouts](Handle* handle) {
        TensorNDArray tensors(layouts.size());
        auto trans_func = [handle](const TensorLayout& layout) {
            auto span = layout.span();
            TensorND res;
            res.reset_ptr(
                    static_cast<uint8_t*>(megdnn_malloc(handle, span.dist_byte())) -
                    span.low_byte);
            res.layout = layout;
            return res;
        };
        std::transform(layouts.begin(), layouts.end(), tensors.begin(), trans_func);
        return tensors;
    };
    auto tensors_cur = allocate(m_handle);
    auto tensors_cur_host = allocate(m_handle_naive.get());
    // init host tensors: per-index RNG if configured, else the default RNG
    for (size_t i = 0; i < tensors_cur_host.size(); ++i) {
        TensorND& tensor = tensors_cur_host[i];
        auto rng = m_rng[i];
        if (!rng)
            rng = m_default_rng.get();
        rng->gen(tensor);
    }
    if (m_tensor_constraint) {
        m_tensor_constraint(tensors_cur_host);
    }
    // upload host data to the device tensors
    for (size_t i = 0; i < tensors_cur_host.size(); ++i) {
        TensorND& tensor = tensors_cur_host[i];
        if (tensor.layout.ndim == 0)
            continue;
        // NOTE(review): copies span().high_byte bytes starting at raw_ptr();
        // when span().low_byte < 0 the bytes before raw_ptr() are not copied
        // -- confirm this is intended for negative-stride layouts
        auto size = tensor.layout.span().high_byte;
        megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr(), tensor.raw_ptr(), size);
    }
    if (m_before_exec_callback) {
        m_before_exec_callback(opr, tensors_cur);
    }
    //! init weights
    m_proxy->init(opr, tensors_cur);
    // run
    // warm up (excluded from the timed section)
    m_proxy->exec(opr, tensors_cur);
    megcoreSynchronize(m_handle->megcore_computing_handle());
    if (m_adaptive_secs) {
        // find m_times for adaptive benchmarking: grow the batch size
        // (at most doubling, scaled by remaining budget) until the
        // requested duration is consumed
        m_times = 0;
        int cur_times = 1;
        auto remain_time = m_adaptive_secs * 1e6;
        while (remain_time > 0) {
            m_timer.reset();
            m_timer.start();
            for (int i = 0; i < cur_times; ++i)
                m_proxy->exec(opr, tensors_cur);
            megcoreSynchronize(m_handle->megcore_computing_handle());
            m_timer.stop();
            m_times += cur_times;
            auto this_run_time = m_timer.get_time_in_us();
            remain_time -= this_run_time;
            cur_times = std::min(
                    cur_times * 2,
                    std::max<int>(1, remain_time / this_run_time * cur_times));
        }
    }
    // timed section: m_times executions plus one device synchronization
    m_timer.reset();
    m_timer.start();
    for (size_t t = 0; t < m_times; ++t)
        m_proxy->exec(opr, tensors_cur);
    megcoreSynchronize(m_handle->megcore_computing_handle());
    m_timer.stop();
    auto time_in_ms = m_timer.get_time_in_us() / 1e3;
    if (m_display) {
        std::cout << "Total time is " << time_in_ms << "ms "
                  << "for " << m_times << " run(s)." << std::endl;
    }
    // free with the same -low_byte bias that was applied at allocation
    auto free = [](Handle* handle, TensorNDArray& tensors) {
        std::for_each(tensors.begin(), tensors.end(), [handle](const TensorND& tensor) {
            megdnn_free(
                    handle, static_cast<dt_byte*>(tensor.raw_ptr()) +
                                    tensor.layout.span().low_byte);
        });
    };
    free(m_handle, tensors_cur);
    free(m_handle_naive.get(), tensors_cur_host);
    // under adaptive benchmarking report average time per run
    if (m_adaptive_secs)
        time_in_ms /= m_times;
    return time_in_ms;
}
/*!
 * \brief like exec(), but input values come from the caller-provided host
 *        tensors instead of being generated by RNGs
 * \param testcase_in host tensors supplying both the layouts and the data
 * \returns total elapsed time in ms, or average per-run time when adaptive
 *          benchmarking is enabled
 */
template <typename Opr, typename T, typename Proxy>
float BenchmarkerBase<Opr, T, Proxy>::exect(const TensorValueArray& testcase_in) {
    auto opr = this->opr();
    opr->param() = m_param;
    TensorLayoutArray layouts;
    TensorNDArray tensors_cur_host;
    for (auto& inp : testcase_in) {
        layouts.push_back(inp.layout);
        tensors_cur_host.emplace_back(inp);
    }
    // cross-check user-supplied shapes against the deduced layouts
    auto user_layouts = layouts;
    m_proxy->deduce_layout(opr, layouts);
    for (size_t i = 0; i < layouts.size(); ++i)
        if (user_layouts[i].ndim > 0) {
            // wrapped in a lambda: ASSERT_TRUE needs a void-returning scope
            auto run = [&]() {
                ASSERT_TRUE(layouts[i].eq_shape(user_layouts[i]))
                        << "User provided shape is "
                        << user_layouts[i].TensorShape::to_string()
                        << "\nExpected shape is "
                        << layouts[i].TensorShape::to_string();
            };
            run();
        }
    // allocate device tensors; pointer biased by -span.low_byte as in exec()
    auto allocate = [&layouts](Handle* handle) {
        TensorNDArray tensors(layouts.size());
        auto trans_func = [handle](const TensorLayout& layout) {
            auto span = layout.span();
            TensorND res;
            res.reset_ptr(
                    static_cast<uint8_t*>(megdnn_malloc(handle, span.dist_byte())) -
                    span.low_byte);
            res.layout = layout;
            return res;
        };
        std::transform(layouts.begin(), layouts.end(), tensors.begin(), trans_func);
        return tensors;
    };
    auto tensors_cur = allocate(m_handle);
    //! init: upload the caller's host data to the device tensors
    for (size_t i = 0; i < tensors_cur_host.size(); ++i) {
        TensorND& tensor = tensors_cur_host[i];
        auto size = tensor.layout.span().high_byte;
        if (tensor.layout.ndim == 0)
            continue;
        // NOTE(review): like exec(), copies high_byte bytes from raw_ptr();
        // bytes before raw_ptr() (low_byte < 0) are skipped -- confirm
        megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr(), tensor.raw_ptr(), size);
    }
    if (m_before_exec_callback) {
        m_before_exec_callback(opr, tensors_cur);
    }
    //! init weights
    m_proxy->init(opr, tensors_cur);
    //! run
    //! warm up (excluded from the timed section)
    m_proxy->exec(opr, tensors_cur);
    megcoreSynchronize(m_handle->megcore_computing_handle());
    if (m_adaptive_secs) {
        //! find m_times for adaptive benchmarking (same scheme as exec())
        m_times = 0;
        int cur_times = 1;
        auto remain_time = m_adaptive_secs * 1e6;
        while (remain_time > 0) {
            m_timer.reset();
            m_timer.start();
            for (int i = 0; i < cur_times; ++i)
                m_proxy->exec(opr, tensors_cur);
            megcoreSynchronize(m_handle->megcore_computing_handle());
            m_timer.stop();
            m_times += cur_times;
            auto this_run_time = m_timer.get_time_in_us();
            remain_time -= this_run_time;
            cur_times = std::min(
                    cur_times * 2,
                    std::max<int>(1, remain_time / this_run_time * cur_times));
        }
    }
    // timed section
    m_timer.reset();
    m_timer.start();
    for (size_t t = 0; t < m_times; ++t)
        m_proxy->exec(opr, tensors_cur);
    megcoreSynchronize(m_handle->megcore_computing_handle());
    m_timer.stop();
    auto time_in_ms = m_timer.get_time_in_us() / 1e3;
    if (m_display) {
        std::cout << "Total time is " << time_in_ms << "ms "
                  << "for " << m_times << " run(s)." << std::endl;
    }
    // only the device tensors were allocated here; host tensors belong to
    // the caller and are not freed
    auto free = [](Handle* handle, TensorNDArray& tensors) {
        std::for_each(tensors.begin(), tensors.end(), [handle](const TensorND& tensor) {
            megdnn_free(
                    handle, static_cast<dt_byte*>(tensor.raw_ptr()) +
                                    tensor.layout.span().low_byte);
        });
    };
    free(m_handle, tensors_cur);
    // under adaptive benchmarking report average time per run
    if (m_adaptive_secs)
        time_in_ms /= m_times;
    return time_in_ms;
}
  334. template <typename Opr, typename T = Timer, typename Proxy = OprProxy<Opr>>
  335. class Benchmarker : public BenchmarkerBase<Opr, T, Proxy> {
  336. public:
  337. Benchmarker(Handle* handle) : BenchmarkerBase<Opr, T, Proxy>{handle, Timer{}} {}
  338. };
  339. ////////////////// Algo Benchmark ////////////////////////
  340. template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer>
  341. float algo_benchmark(
  342. Benchmarker<Opr, T, Proxy>& benchmark, TensorLayoutArray layouts,
  343. const std::string& algo_base) {
  344. Proxy proxy;
  345. auto opr = benchmark.opr();
  346. opr->param() = benchmark.param();
  347. proxy.deduce_layout(opr, layouts);
  348. auto algos = OprAlgoProxy<Opr>::get_all_algorithms_info_safe(opr, layouts);
  349. float min_used = std::numeric_limits<float>::max();
  350. bool execed = false;
  351. for (auto i : algos) {
  352. if (std::regex_match(i.desc.name, std::regex("(" + algo_base + ")(.*)"))) {
  353. opr->execution_policy().algo = i.desc;
  354. auto used = benchmark.exec(layouts);
  355. min_used = std::min(min_used, used);
  356. printf("run algo: %s used: %f ms min_used: %f ms\n", i.desc.name.c_str(),
  357. used, min_used);
  358. execed = true;
  359. }
  360. }
  361. megdnn_assert(execed, "no algo start with %s\n", algo_base.c_str());
  362. return min_used;
  363. }
//! convenience overload: build layouts from shapes via the benchmarker's
//! per-index dtype/format settings, then dispatch to the layout overload
template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer>
float algo_benchmark(
        Benchmarker<Opr, T, Proxy>& benchmark, TensorShapeArray shapes,
        const std::string& algo_base) {
    return algo_benchmark(benchmark, benchmark.make_layouts(shapes), algo_base);
}
  370. } // namespace test
  371. } // namespace megdnn
  372. // vim: syntax=cpp.doxygen