You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

benchmarker.h 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392
#pragma once
#include <algorithm>
#include <functional>
#include <iostream>
#include <limits>
#include <map>
#include <memory>
#include <regex>
#include <vector>
#include "megdnn/basic_types.h"
#include "megdnn/tensor_format.h"
#include "test/common/opr_algo_proxy.h"
#include "test/common/opr_proxy.h"
#include "test/common/rng.h"
#include "test/common/timer.h"
  12. namespace megdnn {
  13. namespace test {
  14. template <typename Opr, typename T>
  15. class BenchmarkerBase {
  16. public:
  17. using Param = typename Opr::Param;
  18. using TensorValueArray = TensorNDArray;
  19. using BeforeExecCallback = std::function<void(Opr*, const TensorValueArray&)>;
  20. using TensorsConstriant = std::function<void(TensorValueArray& tensors)>;
  21. BenchmarkerBase(Handle* handle, T timer)
  22. : m_timer(timer),
  23. m_handle_naive(create_cpu_handle(2, false)),
  24. m_handle(handle),
  25. m_default_rng(new NormalRNG()),
  26. m_param(Param()),
  27. m_proxy{new OprProxy<Opr>()} {}
  28. const Handle* handle() const { return m_handle; }
  29. /*!
  30. * \brief benchmark opr on current param/dtype/rng config
  31. * \returns elapsed time in ms
  32. *
  33. * Benchmarker would construct TensorLayout vectors from shapes and
  34. * dtypes and call exec(TensorLayoutArray &).
  35. */
  36. float exec(const TensorShapeArray& shapes) { return exec(make_layouts(shapes)); }
  37. float exec(TensorLayoutArray layouts);
  38. float exect(const TensorValueArray& testcase_in);
  39. //! disabiguate overloaded exec
  40. float execs(const TensorShapeArray& shapes) { return exec(shapes); }
  41. float execl(const TensorLayoutArray& layouts) { return exec(layouts); }
  42. BenchmarkerBase& set_param(Param param) {
  43. m_param = param;
  44. return *this;
  45. }
  46. BenchmarkerBase& set_dtype(size_t idx, DType dtype) {
  47. m_dtype[idx] = dtype;
  48. return *this;
  49. }
  50. BenchmarkerBase& set_rng(size_t idx, RNG* rng) {
  51. m_rng[idx] = rng;
  52. return *this;
  53. }
  54. BenchmarkerBase& set_fmt(size_t idx, TensorFormat fmt) {
  55. m_fmt[idx] = fmt;
  56. return *this;
  57. }
  58. BenchmarkerBase& set_tensors_constraint(
  59. const TensorsConstriant& tensor_constraint) {
  60. m_tensor_constraint = tensor_constraint;
  61. return *this;
  62. }
  63. TensorLayoutArray make_layouts(const TensorShapeArray& shapes) {
  64. TensorLayoutArray layouts(shapes.size());
  65. for (size_t i = 0; i < shapes.size(); ++i) {
  66. DType dt =
  67. (m_dtype.find(i) != m_dtype.end() ? m_dtype[i] : dtype::Float32());
  68. if (m_fmt.find(i) == m_fmt.end()) {
  69. layouts[i] = TensorLayout(shapes[i], dt);
  70. layouts[i].init_contiguous_stride();
  71. } else
  72. layouts[i] = TensorLayout(shapes[i], dt, m_fmt[i]);
  73. }
  74. return layouts;
  75. }
  76. BenchmarkerBase& set_proxy(std::unique_ptr<OprProxy<Opr>>& proxy) {
  77. m_proxy.reset(nullptr);
  78. m_proxy = std::move(proxy);
  79. return *this;
  80. }
  81. std::unique_ptr<OprProxy<Opr>>& proxy() { return m_proxy; }
  82. BenchmarkerBase& set_times(size_t times) {
  83. m_times = times;
  84. return *this;
  85. }
  86. BenchmarkerBase& set_display(bool display) {
  87. m_display = display;
  88. return *this;
  89. }
  90. //! set a callback to be invoked before executing the operator
  91. BenchmarkerBase& set_before_exec_callback(const BeforeExecCallback& cb) {
  92. m_before_exec_callback = cb;
  93. return *this;
  94. }
  95. /*!
  96. * \brief set adaptive benchmarking: ignore set_times() and find
  97. * suitable times to run for given duration;
  98. *
  99. * Note: the value returned by exec() would be average time per run,
  100. * rather than total elapsed time, if this is enabled.
  101. */
  102. BenchmarkerBase& set_adaptive_benchmark(float tot_time_in_secs) {
  103. m_adaptive_secs = tot_time_in_secs;
  104. return *this;
  105. }
  106. //! get the opr impl so setting other than param() can be modified
  107. Opr* opr() {
  108. if (!m_opr) {
  109. m_opr = m_handle->create_operator<Opr>();
  110. }
  111. return m_opr.get();
  112. }
  113. const Param& param() const { return m_param; }
  114. private:
  115. T m_timer;
  116. bool m_display = true;
  117. size_t m_times = 1;
  118. float m_adaptive_secs = 0;
  119. std::unique_ptr<Handle> m_handle_naive;
  120. Handle* m_handle;
  121. std::unique_ptr<RNG> m_default_rng;
  122. std::map<size_t, RNG*> m_rng;
  123. std::map<size_t, DType> m_dtype;
  124. std::map<size_t, TensorFormat> m_fmt;
  125. Param m_param;
  126. std::unique_ptr<OprProxy<Opr>> m_proxy;
  127. BeforeExecCallback m_before_exec_callback;
  128. std::unique_ptr<Opr> m_opr;
  129. TensorsConstriant m_tensor_constraint;
  130. };
//! benchmark one layout configuration; returns total elapsed ms, or average
//! ms per run when adaptive benchmarking is enabled
template <typename Opr, typename T>
float BenchmarkerBase<Opr, T>::exec(TensorLayoutArray layouts) {
    auto opr = this->opr();
    opr->param() = m_param;
    // remember the caller-provided layouts so deduction can be checked below
    auto user_layouts = layouts;
    m_proxy->deduce_layout(opr, layouts);
    for (size_t i = 0; i < layouts.size(); ++i) {
        if (user_layouts[i].ndim > 0) {
            // gtest ASSERT_* must be used in a void-returning function, so
            // the check is wrapped in an immediately-invoked lambda
            auto run = [&]() {
                ASSERT_TRUE(layouts[i].eq_shape(user_layouts[i]))
                        << "User provided shape is "
                        << user_layouts[i].TensorShape::to_string()
                        << "\nExpected shape is "
                        << layouts[i].TensorShape::to_string();
            };
            run();
        }
    }
    // allocate one buffer per layout on the given handle; the stored pointer
    // is biased by -span.low_byte so the layout's lowest byte offset maps to
    // the start of the allocation (the matching `free` below adds it back)
    auto allocate = [&layouts](Handle* handle) {
        TensorNDArray tensors(layouts.size());
        auto trans_func = [handle](const TensorLayout& layout) {
            auto span = layout.span();
            TensorND res;
            res.reset_ptr(
                    static_cast<uint8_t*>(megdnn_malloc(handle, span.dist_byte())) -
                    span.low_byte);
            res.layout = layout;
            return res;
        };
        std::transform(layouts.begin(), layouts.end(), tensors.begin(), trans_func);
        return tensors;
    };
    auto tensors_cur = allocate(m_handle);
    auto tensors_cur_host = allocate(m_handle_naive.get());
    // init: fill the host tensors with the per-slot RNG (default rng if unset)
    for (size_t i = 0; i < tensors_cur_host.size(); ++i) {
        TensorND& tensor = tensors_cur_host[i];
        auto rng = m_rng[i];
        if (!rng)
            rng = m_default_rng.get();
        rng->gen(tensor);
    }
    // optional user hook to post-process the generated inputs
    if (m_tensor_constraint) {
        m_tensor_constraint(tensors_cur_host);
    }
    // upload host data to the device-side buffers
    for (size_t i = 0; i < tensors_cur_host.size(); ++i) {
        TensorND& tensor = tensors_cur_host[i];
        if (tensor.layout.ndim == 0)
            continue;
        auto size = tensor.layout.span().high_byte;
        megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr(), tensor.raw_ptr(), size);
    }
    if (m_before_exec_callback) {
        m_before_exec_callback(opr, tensors_cur);
    }
    // run
    // warm up (not timed)
    m_proxy->exec(opr, tensors_cur);
    megcoreSynchronize(m_handle->megcore_computing_handle());
    if (m_adaptive_secs) {
        // find m_times for adaptive benchmarking: repeatedly time batches,
        // doubling the batch size but capping it at the estimated number of
        // runs that still fit in the remaining time budget
        m_times = 0;
        int cur_times = 1;
        auto remain_time = m_adaptive_secs * 1e6;
        while (remain_time > 0) {
            m_timer.reset();
            m_timer.start();
            for (int i = 0; i < cur_times; ++i)
                m_proxy->exec(opr, tensors_cur);
            megcoreSynchronize(m_handle->megcore_computing_handle());
            m_timer.stop();
            m_times += cur_times;
            auto this_run_time = m_timer.get_time_in_us();
            remain_time -= this_run_time;
            cur_times = std::min(
                    cur_times * 2,
                    std::max<int>(1, remain_time / this_run_time * cur_times));
        }
    }
    // the timed run: m_times executions, then a single synchronize
    m_timer.reset();
    m_timer.start();
    for (size_t t = 0; t < m_times; ++t)
        m_proxy->exec(opr, tensors_cur);
    megcoreSynchronize(m_handle->megcore_computing_handle());
    m_timer.stop();
    auto time_in_ms = m_timer.get_time_in_us() / 1e3;
    if (m_display) {
        std::cout << "Total time is " << time_in_ms << "ms "
                  << "for " << m_times << " run(s)." << std::endl;
    }
    // undo the low_byte bias applied in `allocate` before releasing memory
    auto free = [](Handle* handle, TensorNDArray& tensors) {
        std::for_each(tensors.begin(), tensors.end(), [handle](const TensorND& tensor) {
            megdnn_free(
                    handle, static_cast<dt_byte*>(tensor.raw_ptr()) +
                                    tensor.layout.span().low_byte);
        });
    };
    free(m_handle, tensors_cur);
    free(m_handle_naive.get(), tensors_cur_host);
    // adaptive mode reports average time per run instead of the total
    if (m_adaptive_secs)
        time_in_ms /= m_times;
    return time_in_ms;
}
//! like exec(), but input values (and layouts) come from testcase_in instead
//! of being generated by the configured RNGs; returns total elapsed ms, or
//! average ms per run when adaptive benchmarking is enabled
template <typename Opr, typename T>
float BenchmarkerBase<Opr, T>::exect(const TensorValueArray& testcase_in) {
    auto opr = this->opr();
    opr->param() = m_param;
    TensorLayoutArray layouts;
    TensorNDArray tensors_cur_host;
    // the host tensors are shallow copies of the caller's testcase tensors;
    // unlike exec(), they are NOT allocated (nor freed) by this function
    for (auto& inp : testcase_in) {
        layouts.push_back(inp.layout);
        tensors_cur_host.emplace_back(inp);
    }
    auto user_layouts = layouts;
    m_proxy->deduce_layout(opr, layouts);
    // verify that deduction agrees with the testcase-provided shapes
    for (size_t i = 0; i < layouts.size(); ++i)
        if (user_layouts[i].ndim > 0) {
            // gtest ASSERT_* must be used in a void-returning function, so
            // the check is wrapped in an immediately-invoked lambda
            auto run = [&]() {
                ASSERT_TRUE(layouts[i].eq_shape(user_layouts[i]))
                        << "User provided shape is "
                        << user_layouts[i].TensorShape::to_string()
                        << "\nExpected shape is "
                        << layouts[i].TensorShape::to_string();
            };
            run();
        }
    // allocate one device buffer per layout; the stored pointer is biased by
    // -span.low_byte so the layout's lowest byte offset maps to the start of
    // the allocation (the matching `free` below adds it back)
    auto allocate = [&layouts](Handle* handle) {
        TensorNDArray tensors(layouts.size());
        auto trans_func = [handle](const TensorLayout& layout) {
            auto span = layout.span();
            TensorND res;
            res.reset_ptr(
                    static_cast<uint8_t*>(megdnn_malloc(handle, span.dist_byte())) -
                    span.low_byte);
            res.layout = layout;
            return res;
        };
        std::transform(layouts.begin(), layouts.end(), tensors.begin(), trans_func);
        return tensors;
    };
    auto tensors_cur = allocate(m_handle);
    //! init: upload the testcase values to the device-side buffers
    for (size_t i = 0; i < tensors_cur_host.size(); ++i) {
        TensorND& tensor = tensors_cur_host[i];
        auto size = tensor.layout.span().high_byte;
        if (tensor.layout.ndim == 0)
            continue;
        megdnn_memcpy_H2D(m_handle, tensors_cur[i].raw_ptr(), tensor.raw_ptr(), size);
    }
    if (m_before_exec_callback) {
        m_before_exec_callback(opr, tensors_cur);
    }
    //! run
    //! warm up (not timed)
    m_proxy->exec(opr, tensors_cur);
    megcoreSynchronize(m_handle->megcore_computing_handle());
    if (m_adaptive_secs) {
        //! find m_times for adaptive benchmarking: repeatedly time batches,
        //! doubling the batch size but capping it at the estimated number of
        //! runs that still fit in the remaining time budget
        m_times = 0;
        int cur_times = 1;
        auto remain_time = m_adaptive_secs * 1e6;
        while (remain_time > 0) {
            m_timer.reset();
            m_timer.start();
            for (int i = 0; i < cur_times; ++i)
                m_proxy->exec(opr, tensors_cur);
            megcoreSynchronize(m_handle->megcore_computing_handle());
            m_timer.stop();
            m_times += cur_times;
            auto this_run_time = m_timer.get_time_in_us();
            remain_time -= this_run_time;
            cur_times = std::min(
                    cur_times * 2,
                    std::max<int>(1, remain_time / this_run_time * cur_times));
        }
    }
    // the timed run: m_times executions, then a single synchronize
    m_timer.reset();
    m_timer.start();
    for (size_t t = 0; t < m_times; ++t)
        m_proxy->exec(opr, tensors_cur);
    megcoreSynchronize(m_handle->megcore_computing_handle());
    m_timer.stop();
    auto time_in_ms = m_timer.get_time_in_us() / 1e3;
    if (m_display) {
        std::cout << "Total time is " << time_in_ms << "ms "
                  << "for " << m_times << " run(s)." << std::endl;
    }
    // undo the low_byte bias applied in `allocate` before releasing memory;
    // only the device buffers are freed — host tensors belong to the caller
    auto free = [](Handle* handle, TensorNDArray& tensors) {
        std::for_each(tensors.begin(), tensors.end(), [handle](const TensorND& tensor) {
            megdnn_free(
                    handle, static_cast<dt_byte*>(tensor.raw_ptr()) +
                                    tensor.layout.span().low_byte);
        });
    };
    free(m_handle, tensors_cur);
    // adaptive mode reports average time per run instead of the total
    if (m_adaptive_secs)
        time_in_ms /= m_times;
    return time_in_ms;
}
//! Benchmarker is selected by timer type T; the primary template is only
//! declared, and this file defines just the Timer specialization below
template <typename Opr, typename T = Timer>
class Benchmarker;
template <typename Opr>
class Benchmarker<Opr, Timer> : public BenchmarkerBase<Opr, Timer> {
public:
    Benchmarker(Handle* handle) : BenchmarkerBase<Opr, Timer>{handle, Timer{}} {}
};
  337. ////////////////// Algo Benchmark ////////////////////////
  338. template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer>
  339. float algo_benchmark(
  340. Benchmarker<Opr, T>& benchmark, TensorLayoutArray layouts,
  341. const std::string& algo_base) {
  342. Proxy proxy;
  343. auto opr = benchmark.opr();
  344. opr->param() = benchmark.param();
  345. proxy.deduce_layout(opr, layouts);
  346. auto algos = OprAlgoProxy<Opr>::get_all_algorithms_info_safe(opr, layouts);
  347. float min_used = std::numeric_limits<float>::max();
  348. bool execed = false;
  349. for (auto i : algos) {
  350. if (std::regex_match(i.desc.name, std::regex("(" + algo_base + ")(.*)"))) {
  351. opr->execution_policy().algo = i.desc;
  352. auto used = benchmark.exec(layouts);
  353. min_used = std::min(min_used, used);
  354. printf("run algo: %s used: %f ms min_used: %f ms\n", i.desc.name.c_str(),
  355. used, min_used);
  356. execed = true;
  357. }
  358. }
  359. megdnn_assert(execed, "no algo start with %s\n", algo_base.c_str());
  360. return min_used;
  361. }
  362. template <typename Opr, typename Proxy = OprProxy<Opr>, typename T = Timer>
  363. float algo_benchmark(
  364. Benchmarker<Opr, T>& benchmark, TensorShapeArray shapes,
  365. const std::string& algo_base) {
  366. return algo_benchmark(benchmark, benchmark.make_layouts(shapes), algo_base);
  367. }
  368. } // namespace test
  369. } // namespace megdnn
  370. // vim: syntax=cpp.doxygen