You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

algo_chooser.cpp 15 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355
  1. /**
  2. * \file src/opr/test/algo_chooser.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "megbrain/comp_node_env.h"
  12. #include "megbrain/opr/blas.h"
  13. #include "megbrain/opr/dnn/convolution.h"
  14. #include "megbrain/test/autocheck.h"
  15. #include "megbrain/test/helper.h"
  16. #include "megbrain/test/megdnn_helper.h"
  17. #include "megbrain/serialization/serializer.h"
  18. #include "megbrain/opr/basic_arith.h"
  19. #include "megbrain/gopt/inference.h"
  20. #include "megbrain/opr/tensor_manip.h"
  21. #include "megdnn/oprs/base.h"
  22. #include "megdnn/dtype.h"
  23. #include "megdnn/heuristic_cache.h"
  24. #include <cmath>
  25. #include <random>
  26. #include <utility>
  27. using namespace mgb;
  28. namespace {
  29. #if MGB_CUDA
  30. #if MGB_ENABLE_FASTRUN
/*!
 * \brief helper that builds the operator node under test for the fastrun
 *        tests.
 *
 * The primary template is only declared; each (operator, arity) combination
 * used below provides a specialization whose operator() forwards the
 * symbolic inputs, param and execution policy to the operator's factory.
 */
template <typename MgbOpr, int arith>
struct GraphMaker;

//! generic two-input case: out = MgbOpr::make(i0, i1, param, policy)
template <typename MgbOpr>
struct GraphMaker<MgbOpr, 2> {
    SymbolVar operator()(const std::array<cg::SymbolVar, 2>& inputs,
                         typename MgbOpr::Param& param,
                         typename MgbOpr::ExecutionPolicy& policy) {
        return MgbOpr::make(inputs[0], inputs[1], param, policy);
    }
};
//! ConvolutionBackwardData is built through its make_deconv() factory
//! rather than the generic make(), hence this full specialization.
template <>
struct GraphMaker<opr::ConvolutionBackwardData, 2> {
    SymbolVar operator()(
            const std::array<cg::SymbolVar, 2>& inputs,
            opr::ConvolutionBackwardData::Param& param,
            opr::ConvolutionBackwardData::ExecutionPolicy& policy) {
        return opr::ConvolutionBackwardData::make_deconv(inputs[0], inputs[1],
                                                         param, policy);
    }
};
//! Convolution3DBackwardData likewise uses its make_deconv() factory.
template <>
struct GraphMaker<opr::Convolution3DBackwardData, 2> {
    SymbolVar operator()(
            const std::array<cg::SymbolVar, 2>& inputs,
            opr::Convolution3DBackwardData::Param& param,
            opr::Convolution3DBackwardData::ExecutionPolicy& policy) {
        return opr::Convolution3DBackwardData::make_deconv(inputs[0], inputs[1],
                                                           param, policy);
    }
};
//! generic three-input case; the trailing {} is a default-constructed
//! OperatorNodeConfig
template <typename MgbOpr>
struct GraphMaker<MgbOpr, 3> {
    SymbolVar operator()(const std::array<cg::SymbolVar, 3>& inputs,
                         typename MgbOpr::Param& param,
                         typename MgbOpr::ExecutionPolicy& policy) {
        return MgbOpr::make(inputs[0], inputs[1], inputs[2], param, policy, {});
    }
};
//! generic four-input case (e.g. DeformableConvForward)
template <typename MgbOpr>
struct GraphMaker<MgbOpr, 4> {
    SymbolVar operator()(const std::array<cg::SymbolVar, 4>& inputs,
                         typename MgbOpr::Param& param,
                         typename MgbOpr::ExecutionPolicy& policy) {
        return MgbOpr::make(inputs[0], inputs[1], inputs[2], inputs[3], param,
                            policy, {});
    }
};
//! generic five-input case (e.g. DeformableConv backward oprs)
template <typename MgbOpr>
struct GraphMaker<MgbOpr, 5> {
    SymbolVar operator()(const std::array<cg::SymbolVar, 5>& inputs,
                         typename MgbOpr::Param& param,
                         typename MgbOpr::ExecutionPolicy& policy) {
        return MgbOpr::make(inputs[0], inputs[1], inputs[2], inputs[3],
                            inputs[4], param, policy, {});
    }
};
  87. template <typename MgbOpr, int arith, typename dtype = dtype::Float32>
  88. void test_fastrun_opr(std::array<TensorShape, arith> inps0,
  89. std::array<TensorShape, arith> inps1,
  90. size_t expect_nr_cache_set_inp0 = 0,
  91. size_t expect_nr_cache_set_inp1 = 0,
  92. typename MgbOpr::Param param = {}) {
  93. using Policy = opr::Convolution::ExecutionPolicy;
  94. using S = Policy::Strategy;
  95. using InputGenerator = std::function<void(HostTensorND & dest)>;
  96. using ShapeInpArray = std::array<TensorShape, arith>;
  97. using CacheMem = std::pair<const void*, size_t>;
  98. auto on_get = [](const std::string&, const void*, size_t, const void*,
  99. size_t) {};
  100. std::vector<std::pair<CacheMem, CacheMem>> cache_set_history;
  101. auto on_set = [&cache_set_history](const std::string&, const void* key,
  102. size_t key_size, const void* val,
  103. size_t val_size) {
  104. cache_set_history.emplace_back(std::make_pair(key, key_size),
  105. std::make_pair(val, val_size));
  106. };
  107. PersistentCacheHook cache_hook{on_get, on_set};
  108. CompNode comp_node = CompNode::load("xpu0");
  109. GraphMaker<MgbOpr, arith> graph_maker;
  110. auto run = [&param, &comp_node, &graph_maker](
  111. const std::shared_ptr<cg::ComputingGraph>& graph,
  112. const ShapeInpArray& shapes) {
  113. std::array<InputGenerator, arith> inputs_generator;
  114. std::array<std::shared_ptr<HostTensorND>, arith> inputs;
  115. for (size_t i = 0; i < arith; ++i) {
  116. inputs[i] = std::make_shared<HostTensorND>(comp_node,
  117. dtype());
  118. }
  119. HostTensorGenerator<dtype> gen_host;
  120. for (size_t i = 0; i < arith; ++i) {
  121. inputs[i]->resize(shapes[i]);
  122. *inputs[i] = *gen_host(inputs[i]->shape(), comp_node);
  123. mgb_assert(inputs[i]->shape().eq_shape(shapes[i]));
  124. }
  125. std::array<cg::SymbolVar, arith> sym_in;
  126. for (size_t i = 0; i < arith; ++i) {
  127. // to trigger graph trans
  128. sym_in[i] = opr::Host2DeviceCopy::make(*graph, inputs[i],
  129. ssprintf("inp%zu", i));
  130. }
  131. Policy policy;
  132. policy.strategy = S::PROFILE;
  133. auto out = graph_maker(sym_in, param, policy);
  134. std::unique_ptr<cg::AsyncExecutable> func =
  135. graph->compile({{out, {}}});
  136. func->execute();
  137. };
  138. std::shared_ptr<cg::ComputingGraph> fastrun_ignore_batchsize_graph =
  139. ComputingGraph::make();
  140. fastrun_ignore_batchsize_graph->options()
  141. .fast_run_config.shared_batch_size = 20;
  142. run(fastrun_ignore_batchsize_graph, inps0);
  143. size_t nr_set_inp0 = cache_set_history.size();
  144. if (expect_nr_cache_set_inp0) {
  145. ASSERT_EQ(cache_set_history.size(), expect_nr_cache_set_inp0);
  146. }
  147. run(fastrun_ignore_batchsize_graph, inps1);
  148. size_t nr_set_total = expect_nr_cache_set_inp1 + nr_set_inp0;
  149. ASSERT_EQ(cache_set_history.size(), nr_set_total);
  150. }
//! conv fwd / bwd-data / bwd-filter: the two shape sets differ only in the
//! batch dimension, so with shared_batch_size the second run is expected to
//! add no new cache entries (default expectations of test_fastrun_opr)
TEST(TestOprDNN, FastrunIgnoreBatchSizeConvolution) {
    REQUIRE_GPU(1);
    test_fastrun_opr<opr::Convolution, 2>(
            {TensorShape{12, 3, 36, 36}, TensorShape{4, 3, 3, 3}},
            {TensorShape{1, 3, 36, 36}, TensorShape{4, 3, 3, 3}});
    test_fastrun_opr<opr::ConvolutionBackwardData, 2>(
            {TensorShape{12, 4, 23, 29}, TensorShape{4, 5, 3, 2}},
            {TensorShape{2, 4, 23, 29}, TensorShape{4, 5, 3, 2}});
    test_fastrun_opr<opr::ConvolutionBackwardFilter, 3>(
            {TensorShape{12, 4, 23, 29}, TensorShape{12, 5, 21, 28},
             TensorShape{5, 4, 3, 2}},
            {TensorShape{2, 4, 23, 29}, TensorShape{2, 5, 21, 28},
             TensorShape{5, 4, 3, 2}});
}
//! ConvBias (src, filter, bias): only the src batch differs between runs
TEST(TestOprDNN, FastrunIgnoreBatchSizeConvBias) {
    REQUIRE_GPU(1);
    test_fastrun_opr<opr::ConvBias, 3>(
            {TensorShape{20, 16, 50, 50}, TensorShape{24, 16, 3, 3},
             TensorShape{1, 24, 1, 1}},
            {TensorShape{1, 16, 50, 50}, TensorShape{24, 16, 3, 3},
             TensorShape{1, 24, 1, 1}});
}
//! 3D conv fwd / bwd-data / bwd-filter, same batch-only shape change
TEST(TestOprDNN, FastrunIgnoreBatchSizeConvolution3D) {
    REQUIRE_GPU(1);
    test_fastrun_opr<opr::Convolution3D, 2>(
            {TensorShape{8, 4, 12, 13, 14}, TensorShape{4, 4, 3, 3, 3}},
            {TensorShape{3, 4, 12, 13, 14}, TensorShape{4, 4, 3, 3, 3}});
    test_fastrun_opr<opr::Convolution3DBackwardData, 2>(
            {TensorShape{14, 5, 12, 12, 16}, TensorShape{5, 5, 3, 3, 3}},
            {TensorShape{4, 5, 12, 12, 16}, TensorShape{5, 5, 3, 3, 3}});
    test_fastrun_opr<opr::Convolution3DBackwardFilter, 3>(
            {TensorShape{64, 16, 18, 18, 18}, TensorShape{64, 16, 18, 18, 18},
             TensorShape{16, 16, 1, 1, 1}},
            {TensorShape{4, 16, 18, 18, 18}, TensorShape{4, 16, 18, 18, 18},
             TensorShape{16, 16, 1, 1, 1}});
}
//! LocalShare fwd / bwd-data / bwd-filter; the forward case passes an
//! explicit non-default param (cross-correlation, pad/stride 1, 2x2 groups)
TEST(TestOprDNN, FastrunIgnoreBatchSizeLocalShare) {
    REQUIRE_GPU(1);
    opr::LocalShare::Param local_share_param;
    local_share_param.mode = opr::LocalShare::Param::Mode::CROSS_CORRELATION;
    local_share_param.pad_h = local_share_param.pad_w = 1;
    local_share_param.stride_h = local_share_param.stride_w = 1;
    local_share_param.spatial_groups_h = local_share_param.spatial_groups_w = 2;
    // expect_nr_cache_set_inp0/inp1 are both 0 (the defaults, passed
    // explicitly here only because local_share_param follows them)
    test_fastrun_opr<opr::LocalShareForward, 2>(
            {TensorShape{32, 2, 23, 23}, TensorShape{2, 2, 2, 2, 2, 7}},
            {TensorShape{3, 2, 23, 23}, TensorShape{2, 2, 2, 2, 2, 7}}, 0, 0,
            local_share_param);
    test_fastrun_opr<opr::LocalShareBackwardData, 3>(
            {TensorShape{3, 3, 128, 1, 1, 128}, TensorShape{32, 128, 24, 24},
             TensorShape{32, 128, 24, 24}},
            {TensorShape{3, 3, 128, 1, 1, 128}, TensorShape{2, 128, 24, 24},
             TensorShape{2, 128, 24, 24}});
    test_fastrun_opr<opr::LocalShareBackwardFilter, 3>(
            {TensorShape{12, 3, 36, 36}, TensorShape{12, 4, 35, 35},
             TensorShape{3, 3, 3, 3, 3, 4}},
            {TensorShape{4, 3, 36, 36}, TensorShape{4, 4, 35, 35},
             TensorShape{3, 3, 3, 3, 3, 4}});
}
//! DeformableConv fwd (4 inputs) and bwd-data/bwd-filter (5 inputs);
//! only batch-sized dimensions change between the two runs
TEST(TestOprDNN, FastrunIgnoreBatchSizeDeformableConv) {
    REQUIRE_GPU(1);
    test_fastrun_opr<opr::DeformableConvForward, 4>(
            {TensorShape{12, 6, 20, 20}, TensorShape{6, 6, 3, 3},
             TensorShape{12, 18, 18, 18}, TensorShape{12, 9, 18, 18}},
            {TensorShape{4, 6, 20, 20}, TensorShape{6, 6, 3, 3},
             TensorShape{4, 18, 18, 18}, TensorShape{4, 9, 18, 18}});
    test_fastrun_opr<opr::DeformableConvBackwardData, 5>(
            {TensorShape{12, 6, 20, 20}, TensorShape{6, 6, 3, 3},
             TensorShape{12, 18, 18, 18}, TensorShape{12, 9, 18, 18},
             TensorShape{12, 6, 18, 18}},
            {TensorShape{4, 6, 20, 20},
             TensorShape{6, 6, 3, 3},
             TensorShape{4, 18, 18, 18},
             TensorShape{4, 9, 18, 18},
             TensorShape{4, 6, 18, 18}});
    test_fastrun_opr<opr::DeformableConvBackwardFilter, 5>(
            {TensorShape{12, 6, 20, 20}, TensorShape{6, 6, 3, 3},
             TensorShape{12, 18, 18, 18}, TensorShape{12, 9, 18, 18},
             TensorShape{12, 6, 18, 18}},
            {TensorShape{4, 6, 20, 20}, TensorShape{6, 6, 3, 3},
             TensorShape{4, 18, 18, 18}, TensorShape{4, 9, 18, 18},
             TensorShape{4, 6, 18, 18}});
}
//! MatrixMul: profiling tries all four transpose combinations, hence 4
//! cache writes on the first run; the second run (shapes sharing the
//! non-batch dim) is expected to add 2 more (see table below)
TEST(TestOprDNN, FastrunIgnoreBatchSizeMatrixMul) {
    REQUIRE_GPU(1);
    //! fastrun_shared_batch_size == 20
    //! {20(12), 12(1)}, {12(12), 20(1)} -> {20(12), 20(1)} origin
    //! {12(10), 20(1)}, {12(12), 20(1)} -> {20(12), 20(1)} transA
    //! {12(10), 20(1)}, {20(12), 12(1)} -> {20(12), 20(1)} transA, transB
    //! {20(12), 12(1)}, {20(12), 12(1)} -> {20(12), 20(1)} transB
    //!
    //! {20(12), 12(1)}, {12(12), 20(1)} -> {20(12), 20(1)} origin duplicate
    //! {12(4), 20(1)}, {12(12), 20(1)} -> {20(12), 20(1)} transA
    //! {12(4), 20(1)}, {20(12), 12(1)} -> {20(12), 20(1)} transA, transB
    //! {20(12), 12(1)}, {20(12), 12(1)} -> {20(12), 20(1)} transB duplicate
    test_fastrun_opr<opr::MatrixMul, 2>(
            {TensorShape{10, 12}, TensorShape{12, 12}},
            {TensorShape{4, 12}, TensorShape{12, 12}}, 4, 2);
}
//! BatchedMatrixMul: with the leading dim replaced by the shared batch
//! size, every transpose combination of the second run duplicates one from
//! the first run, so no additional cache writes are expected (defaults)
TEST(TestOprDNN, FastrunIgnoreBatchSizeBatchedMatrixMul) {
    REQUIRE_GPU(1);
    //! fastrun_shared_batch_size == 20
    //! {20(48), 6(8), 8(1)}, {20(32), 8(4), 4(1)} -> {20(24), 6(4), 4(1)} origin
    //! {20(48), 8(6), 6(1)}, {20(32), 8(4), 4(1)} -> {20(24), 6(4), 4(1)} transA
    //! {20(48), 8(6), 6(1)}, {20(32), 4(8), 8(1)} -> {20(24), 6(4), 4(1)} transA, transB
    //! {20(48), 6(8), 8(1)}, {20(32), 4(8), 8(1)} -> {20(24), 6(4), 4(1)} transB
    //!
    //! {20(48), 6(8), 8(1)}, {20(32), 8(4), 4(1)} -> {20(24), 6(4), 4(1)} origin duplicate
    //! {20(48), 8(6), 6(1)}, {20(32), 8(4), 4(1)} -> {20(24), 6(4), 4(1)} transA duplicate
    //! {20(48), 8(6), 6(1)}, {20(32), 4(8), 8(1)} -> {20(24), 6(4), 4(1)} transA, transB duplicate
    //! {20(48), 6(8), 8(1)}, {20(32), 4(8), 8(1)} -> {20(24), 6(4), 4(1)} transB duplicate
    test_fastrun_opr<opr::BatchedMatrixMul, 2>(
            {TensorShape{12, 6, 8}, TensorShape{12, 8, 4}},
            {TensorShape{4, 6, 8}, TensorShape{4, 8, 4}});
}
  265. template <typename MgbOpr>
  266. void test_no_profiling_on_shape_change(const TensorShapeArray& inps0,
  267. const TensorShapeArray& inps1) {
  268. using Policy = typename MgbOpr::ExecutionPolicy;
  269. int nr_set = 0;
  270. auto on_get = [](const std::string&, const void*, size_t, const void*,
  271. size_t) {};
  272. auto on_set = [&nr_set](const std::string&, const void*, size_t,
  273. const void*, size_t) { nr_set++; };
  274. PersistentCacheHook cache_hook{on_get, on_set};
  275. auto cn = CompNode::load("xpu0");
  276. auto run = [&cn](const TensorShapeArray& shapes) {
  277. auto graph = ComputingGraph::make();
  278. graph->options().no_profiling_on_shape_change = true;
  279. HostTensorGenerator<> gen;
  280. auto host_a = gen(shapes[0], cn);
  281. auto host_b = gen(shapes[1], cn);
  282. HostTensorND host_out;
  283. auto a = opr::Host2DeviceCopy::make(*graph, host_a),
  284. b = opr::Host2DeviceCopy::make(*graph, host_b);
  285. Policy policy;
  286. policy.strategy = Policy::Strategy::PROFILE;
  287. auto out = MgbOpr::make(a, b, {}, policy, {});
  288. std::unique_ptr<cg::AsyncExecutable> func = graph->compile({{out, {}}});
  289. func->execute();
  290. };
  291. run(inps0);
  292. int nr = nr_set;
  293. ASSERT_GT(nr, 0);
  294. run(inps1);
  295. ASSERT_EQ(nr, nr_set);
  296. }
//! Convolution and MatrixMul must not re-profile when only shapes change;
//! the heuristic cache is cleared first so the initial run really profiles
TEST(TestOprDNN, FastrunNoProfilingOnShapeChange) {
    REQUIRE_GPU(1);
    megdnn::HeuristicCache::instance().clear();
    test_no_profiling_on_shape_change<opr::Convolution>(
            {{12, 3, 36, 36}, {4, 3, 3, 3}}, {{32, 3, 28, 28}, {4, 3, 3, 3}});
    test_no_profiling_on_shape_change<opr::MatrixMul>({{20, 30}, {30, 40}},
                                                      {{30, 40}, {40, 60}});
}
  305. #endif // MGB_ENABLE_FASTRUN
  306. #endif // MGB_CUDA
  307. } // anonymous namespace
  308. // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台