You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

algo_chooser.cpp 17 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409
  1. /**
  2. * \file src/opr/test/algo_chooser.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "megbrain/comp_node_env.h"
  12. #include "megbrain/gopt/inference.h"
  13. #include "megbrain/opr/basic_arith.h"
  14. #include "megbrain/opr/blas.h"
  15. #include "megbrain/opr/dnn/convolution.h"
  16. #include "megbrain/opr/dnn/pooling.h"
  17. #include "megbrain/opr/tensor_manip.h"
  18. #include "megbrain/serialization/opr_shallow_copy.h"
  19. #include "megbrain/serialization/serializer.h"
  20. #include "megbrain/test/autocheck.h"
  21. #include "megbrain/test/helper.h"
  22. #include "megbrain/test/megdnn_helper.h"
  23. #include "megdnn/dtype.h"
  24. #include "megdnn/heuristic_cache.h"
  25. #include "megdnn/oprs/base.h"
  26. #include <cmath>
  27. #include <random>
  28. #include <utility>
  29. using namespace mgb;
  30. namespace {
  31. template <typename MgbOpr, int arith>
  32. struct GraphMaker;
  33. template <>
  34. struct GraphMaker<opr::Pooling, 1> {
  35. SymbolVar operator()(
  36. const std::array<cg::SymbolVar, 1>& inputs, opr::Pooling::Param& param,
  37. opr::Pooling::ExecutionPolicy& policy) {
  38. return opr::Pooling::make(inputs[0], param, policy);
  39. }
  40. };
  41. template <typename MgbOpr>
  42. struct GraphMaker<MgbOpr, 2> {
  43. SymbolVar operator()(
  44. const std::array<cg::SymbolVar, 2>& inputs, typename MgbOpr::Param& param,
  45. typename MgbOpr::ExecutionPolicy& policy) {
  46. return MgbOpr::make(inputs[0], inputs[1], param, policy);
  47. }
  48. };
  49. template <typename MgbOpr>
  50. struct GraphMaker<MgbOpr, 3> {
  51. SymbolVar operator()(
  52. const std::array<cg::SymbolVar, 3>& inputs, typename MgbOpr::Param& param,
  53. typename MgbOpr::ExecutionPolicy& policy) {
  54. return MgbOpr::make(inputs[0], inputs[1], inputs[2], param, policy, {});
  55. }
  56. };
  57. template <typename MgbOpr>
  58. struct GraphMaker<MgbOpr, 4> {
  59. SymbolVar operator()(
  60. const std::array<cg::SymbolVar, 4>& inputs, typename MgbOpr::Param& param,
  61. typename MgbOpr::ExecutionPolicy& policy) {
  62. return MgbOpr::make(
  63. inputs[0], inputs[1], inputs[2], inputs[3], param, policy, {});
  64. }
  65. };
  66. template <typename MgbOpr>
  67. struct GraphMaker<MgbOpr, 5> {
  68. SymbolVar operator()(
  69. const std::array<cg::SymbolVar, 5>& inputs, typename MgbOpr::Param& param,
  70. typename MgbOpr::ExecutionPolicy& policy) {
  71. return MgbOpr::make(
  72. inputs[0], inputs[1], inputs[2], inputs[3], inputs[4], param, policy,
  73. {});
  74. }
  75. };
  76. template <typename MgbOpr, int arith, typename dtype = dtype::Float32>
  77. void test_execution_policy_shallow_copy(
  78. std::array<TensorShape, arith> shapes, typename MgbOpr::Param param = {}) {
  79. using Policy = typename MgbOpr::ExecutionPolicy;
  80. Policy policy;
  81. policy.strategy = Policy::Strategy::PROFILE;
  82. auto cn = CompNode::load("cpu0");
  83. auto graph0 = ComputingGraph::make(), graph1 = ComputingGraph::make();
  84. std::array<cg::SymbolVar, arith> inputs0;
  85. VarNodeArray inputs1;
  86. for (size_t i = 0; i < arith; ++i) {
  87. HostTensorND hi{cn, shapes[i], dtype()};
  88. inputs0[i] = opr::ImmutableTensor::make(*graph0, hi);
  89. inputs1.push_back(opr::ImmutableTensor::make(*graph1, hi).node());
  90. }
  91. GraphMaker<MgbOpr, arith> graph_maker;
  92. auto opr0 = graph_maker(inputs0, param, policy).node()->owner_opr();
  93. auto opr1 = serialization::copy_opr_shallow(*opr0, inputs1, OperatorNodeConfig{});
  94. auto m0 = &(opr0->template cast_final<MgbOpr>());
  95. auto m1 = &(opr1->template cast_final<MgbOpr>());
  96. ASSERT_EQ(policy.strategy, m0->execution_policy().strategy);
  97. ASSERT_EQ(policy.strategy, m1->execution_policy().strategy);
  98. }
  99. #if MGB_CUDA
  100. #if MGB_ENABLE_FASTRUN
  101. template <typename MgbOpr, int arith, typename dtype = dtype::Float32>
  102. void test_fastrun_opr(
  103. std::array<TensorShape, arith> inps0, std::array<TensorShape, arith> inps1,
  104. size_t expect_nr_cache_set_inp0 = 0, size_t expect_nr_cache_set_inp1 = 0,
  105. typename MgbOpr::Param param = {}) {
  106. using Policy = opr::Convolution::ExecutionPolicy;
  107. using S = Policy::Strategy;
  108. using InputGenerator = std::function<void(HostTensorND & dest)>;
  109. using ShapeInpArray = std::array<TensorShape, arith>;
  110. using CacheMem = std::pair<const void*, size_t>;
  111. auto on_get = [](const std::string&, const void*, size_t, const void*, size_t) {};
  112. std::vector<std::pair<CacheMem, CacheMem>> cache_set_history;
  113. auto on_set = [&cache_set_history](
  114. const std::string&, const void* key, size_t key_size,
  115. const void* val, size_t val_size) {
  116. cache_set_history.emplace_back(
  117. std::make_pair(key, key_size), std::make_pair(val, val_size));
  118. };
  119. PersistentCacheHook cache_hook{on_get, on_set};
  120. CompNode comp_node = CompNode::load("xpu0");
  121. GraphMaker<MgbOpr, arith> graph_maker;
  122. auto run = [&param, &comp_node, &graph_maker](
  123. const std::shared_ptr<cg::ComputingGraph>& graph,
  124. const ShapeInpArray& shapes) {
  125. std::array<InputGenerator, arith> inputs_generator;
  126. std::array<std::shared_ptr<HostTensorND>, arith> inputs;
  127. for (size_t i = 0; i < arith; ++i) {
  128. inputs[i] = std::make_shared<HostTensorND>(comp_node, dtype());
  129. }
  130. HostTensorGenerator<dtype> gen_host;
  131. for (size_t i = 0; i < arith; ++i) {
  132. inputs[i]->resize(shapes[i]);
  133. *inputs[i] = *gen_host(inputs[i]->shape(), comp_node);
  134. mgb_assert(inputs[i]->shape().eq_shape(shapes[i]));
  135. }
  136. std::array<cg::SymbolVar, arith> sym_in;
  137. for (size_t i = 0; i < arith; ++i) {
  138. // to trigger graph trans
  139. sym_in[i] = opr::Host2DeviceCopy::make(
  140. *graph, inputs[i], ssprintf("inp%zu", i));
  141. }
  142. Policy policy;
  143. policy.strategy = S::PROFILE;
  144. auto out = graph_maker(sym_in, param, policy);
  145. std::unique_ptr<cg::AsyncExecutable> func = graph->compile({{out, {}}});
  146. func->execute();
  147. };
  148. std::shared_ptr<cg::ComputingGraph> fastrun_ignore_batchsize_graph =
  149. ComputingGraph::make();
  150. fastrun_ignore_batchsize_graph->options().fast_run_config.shared_batch_size = 20;
  151. run(fastrun_ignore_batchsize_graph, inps0);
  152. size_t nr_set_inp0 = cache_set_history.size();
  153. if (expect_nr_cache_set_inp0) {
  154. ASSERT_EQ(cache_set_history.size(), expect_nr_cache_set_inp0);
  155. }
  156. run(fastrun_ignore_batchsize_graph, inps1);
  157. size_t nr_set_total = expect_nr_cache_set_inp1 + nr_set_inp0;
  158. ASSERT_EQ(cache_set_history.size(), nr_set_total);
  159. }
  160. #endif // MGB_ENABLE_FASTRUN
  161. #endif // MGB_CUDA
  162. } // anonymous namespace
  163. #if MGB_CUDA
  164. #if MGB_ENABLE_FASTRUN
// Convolution fwd / bwd-data / bwd-filter: for each call the two shape sets
// differ only in the batch dimension (12 vs 1 or 2), exercising the
// shared_batch_size fastrun cache path set up by test_fastrun_opr.
TEST(TestOprDNN, FastrunIgnoreBatchSizeConvolution) {
    REQUIRE_GPU(1);
    test_fastrun_opr<opr::Convolution, 2>(
            {TensorShape{12, 3, 36, 36}, TensorShape{4, 3, 3, 3}},
            {TensorShape{1, 3, 36, 36}, TensorShape{4, 3, 3, 3}});
    test_fastrun_opr<opr::ConvolutionBackwardData, 3>(
            {TensorShape{4, 5, 3, 2}, TensorShape{12, 4, 23, 29},
             TensorShape{12, 5, 25, 30}},
            {TensorShape{4, 5, 3, 2}, TensorShape{2, 4, 23, 29},
             TensorShape{2, 5, 25, 30}});
    test_fastrun_opr<opr::ConvolutionBackwardFilter, 3>(
            {TensorShape{12, 4, 23, 29}, TensorShape{12, 5, 21, 28},
             TensorShape{5, 4, 3, 2}},
            {TensorShape{2, 4, 23, 29}, TensorShape{2, 5, 21, 28},
             TensorShape{5, 4, 3, 2}});
}
// ConvBias (src, filter, bias): only the src batch size differs between the
// two runs (20 vs 1); filter and bias shapes are identical.
TEST(TestOprDNN, FastrunIgnoreBatchSizeConvBias) {
    REQUIRE_GPU(1);
    test_fastrun_opr<opr::ConvBias, 3>(
            {TensorShape{20, 16, 50, 50}, TensorShape{24, 16, 3, 3},
             TensorShape{1, 24, 1, 1}},
            {TensorShape{1, 16, 50, 50}, TensorShape{24, 16, 3, 3},
             TensorShape{1, 24, 1, 1}});
}
// Convolution3D fwd / bwd-data / bwd-filter with 5-d tensors; again the
// shape sets differ only in the batch dimension.
TEST(TestOprDNN, FastrunIgnoreBatchSizeConvolution3D) {
    REQUIRE_GPU(1);
    test_fastrun_opr<opr::Convolution3D, 2>(
            {TensorShape{8, 4, 12, 13, 14}, TensorShape{4, 4, 3, 3, 3}},
            {TensorShape{3, 4, 12, 13, 14}, TensorShape{4, 4, 3, 3, 3}});
    test_fastrun_opr<opr::Convolution3DBackwardData, 3>(
            {TensorShape{5, 5, 3, 3, 3}, TensorShape{14, 5, 12, 12, 16},
             TensorShape{14, 5, 14, 14, 18}},
            {TensorShape{5, 5, 3, 3, 3}, TensorShape{4, 5, 12, 12, 16},
             TensorShape{4, 5, 14, 14, 18}});
    test_fastrun_opr<opr::Convolution3DBackwardFilter, 3>(
            {TensorShape{64, 16, 18, 18, 18}, TensorShape{64, 16, 18, 18, 18},
             TensorShape{16, 16, 1, 1, 1}},
            {TensorShape{4, 16, 18, 18, 18}, TensorShape{4, 16, 18, 18, 18},
             TensorShape{16, 16, 1, 1, 1}});
}
  205. TEST(TestOprDNN, FastrunIgnoreBatchSizeLocalShare) {
  206. REQUIRE_GPU(1);
  207. opr::LocalShare::Param local_share_param;
  208. local_share_param.mode = opr::LocalShare::Param::Mode::CROSS_CORRELATION;
  209. local_share_param.pad_h = local_share_param.pad_w = 1;
  210. local_share_param.stride_h = local_share_param.stride_w = 1;
  211. local_share_param.spatial_groups_h = local_share_param.spatial_groups_w = 2;
  212. test_fastrun_opr<opr::LocalShareForward, 2>(
  213. {TensorShape{32, 2, 23, 23}, TensorShape{2, 2, 2, 2, 2, 7}},
  214. {TensorShape{3, 2, 23, 23}, TensorShape{2, 2, 2, 2, 2, 7}}, 0, 0,
  215. local_share_param);
  216. test_fastrun_opr<opr::LocalShareBackwardData, 3>(
  217. {TensorShape{3, 3, 128, 1, 1, 128}, TensorShape{32, 128, 24, 24},
  218. TensorShape{32, 128, 24, 24}},
  219. {TensorShape{3, 3, 128, 1, 1, 128}, TensorShape{2, 128, 24, 24},
  220. TensorShape{2, 128, 24, 24}});
  221. test_fastrun_opr<opr::LocalShareBackwardFilter, 3>(
  222. {TensorShape{12, 3, 36, 36}, TensorShape{12, 4, 35, 35},
  223. TensorShape{3, 3, 3, 3, 3, 4}},
  224. {TensorShape{4, 3, 36, 36}, TensorShape{4, 4, 35, 35},
  225. TensorShape{3, 3, 3, 3, 3, 4}});
  226. }
// DeformableConv fwd (4 inputs) and bwd-data/bwd-filter (5 inputs); batch
// dimension 12 in the first run vs 4 in the second, filter shape unchanged.
TEST(TestOprDNN, FastrunIgnoreBatchSizeDeformableConv) {
    REQUIRE_GPU(1);
    test_fastrun_opr<opr::DeformableConvForward, 4>(
            {TensorShape{12, 6, 20, 20}, TensorShape{6, 6, 3, 3},
             TensorShape{12, 18, 18, 18}, TensorShape{12, 9, 18, 18}},
            {TensorShape{4, 6, 20, 20}, TensorShape{6, 6, 3, 3},
             TensorShape{4, 18, 18, 18}, TensorShape{4, 9, 18, 18}});
    test_fastrun_opr<opr::DeformableConvBackwardData, 5>(
            {TensorShape{12, 6, 20, 20}, TensorShape{6, 6, 3, 3},
             TensorShape{12, 18, 18, 18}, TensorShape{12, 9, 18, 18},
             TensorShape{12, 6, 18, 18}},
            {TensorShape{4, 6, 20, 20}, TensorShape{6, 6, 3, 3},
             TensorShape{4, 18, 18, 18}, TensorShape{4, 9, 18, 18},
             TensorShape{4, 6, 18, 18}});
    test_fastrun_opr<opr::DeformableConvBackwardFilter, 5>(
            {TensorShape{12, 6, 20, 20}, TensorShape{6, 6, 3, 3},
             TensorShape{12, 18, 18, 18}, TensorShape{12, 9, 18, 18},
             TensorShape{4, 6, 20, 20}, TensorShape{6, 6, 3, 3},
             TensorShape{4, 18, 18, 18}, TensorShape{4, 9, 18, 18},
             TensorShape{4, 6, 18, 18}});
}
// MatrixMul: per the table below, profiling the first run writes 4 cache
// entries (origin/transA/transA+transB/transB); the second run adds only the
// 2 variants whose shared-batch shapes were not already covered.
// NOTE(review): the "{20(12), 12(1)}" notation presumably reads
// dim(stride) — confirm against the fastrun profiling code.
TEST(TestOprDNN, FastrunIgnoreBatchSizeMatrixMul) {
    REQUIRE_GPU(1);
    //! fastrun_shared_batch_size == 20
    //! {20(12), 12(1)}, {12(12), 20(1)} -> {20(12), 20(1)} origin
    //! {12(10), 20(1)}, {12(12), 20(1)} -> {20(12), 20(1)} transA
    //! {12(10), 20(1)}, {20(12), 12(1)} -> {20(12), 20(1)} transA, transB
    //! {20(12), 12(1)}, {20(12), 12(1)} -> {20(12), 20(1)} transB
    //!
    //! {20(12), 12(1)}, {12(12), 20(1)} -> {20(12), 20(1)} origin duplicate
    //! {12(4), 20(1)}, {12(12), 20(1)} -> {20(12), 20(1)} transA
    //! {12(4), 20(1)}, {20(12), 12(1)} -> {20(12), 20(1)} transA, transB
    //! {20(12), 12(1)}, {20(12), 12(1)} -> {20(12), 20(1)} transB duplicate
    test_fastrun_opr<opr::MatrixMul, 2>(
            {TensorShape{10, 12}, TensorShape{12, 12}},
            {TensorShape{4, 12}, TensorShape{12, 12}}, 4, 2);
}
// BatchedMatrixMul: per the table below, every transposition variant of the
// second run duplicates an entry from the first, so no extra cache-write
// counts are asserted (defaults of 0 / 0 are used).
TEST(TestOprDNN, FastrunIgnoreBatchSizeBatchedMatrixMul) {
    REQUIRE_GPU(1);
    //! fastrun_shared_batch_size == 20
    //! {20(48), 6(8), 8(1)}, {20(32), 8(4), 4(1)} -> {20(24), 6(4), 4(1)} origin
    //! {20(48), 8(6), 6(1)}, {20(32), 8(4), 4(1)} -> {20(24), 6(4), 4(1)} transA
    //! {20(48), 8(6), 6(1)}, {20(32), 4(8), 8(1)} -> {20(24), 6(4), 4(1)} transA,
    //! transB {20(48), 6(8), 8(1)}, {20(32), 4(8), 8(1)} -> {20(24), 6(4), 4(1)} transB
    //!
    //! {20(48), 6(8), 8(1)}, {20(32), 8(4), 4(1)} -> {20(24), 6(4), 4(1)} origin
    //! duplicate {20(48), 8(6), 6(1)}, {20(32), 8(4), 4(1)} -> {20(24), 6(4), 4(1)}
    //! transA duplicate {20(48), 8(6), 6(1)}, {20(32), 4(8), 8(1)} -> {20(24), 6(4),
    //! 4(1)} transA, transB duplicate {20(48), 6(8), 8(1)}, {20(32), 4(8), 8(1)} ->
    //! {20(24), 6(4), 4(1)} transB duplicate
    test_fastrun_opr<opr::BatchedMatrixMul, 2>(
            {TensorShape{12, 6, 8}, TensorShape{12, 8, 4}},
            {TensorShape{4, 6, 8}, TensorShape{4, 8, 4}});
}
  282. #endif // MGB_ENABLE_FASTRUN
  283. #endif // MGB_CUDA
// Shallow-copy policy preservation for Convolution fwd / bwd-data /
// bwd-filter (runs on cpu0; no GPU requirement).
TEST(TestOprDNN, ExecutionPolicyShallowCopyConvolution) {
    test_execution_policy_shallow_copy<opr::Convolution, 2>(
            {TensorShape{12, 3, 36, 36}, TensorShape{4, 3, 3, 3}});
    test_execution_policy_shallow_copy<opr::ConvolutionBackwardData, 3>(
            {TensorShape{4, 5, 3, 2}, TensorShape{12, 4, 23, 29},
             TensorShape{12, 5, 25, 30}});
    test_execution_policy_shallow_copy<opr::ConvolutionBackwardFilter, 3>(
            {TensorShape{12, 4, 23, 29}, TensorShape{12, 5, 21, 28},
             TensorShape{5, 4, 3, 2}});
}
// Shallow-copy policy preservation for ConvBias (src, filter, bias).
TEST(TestOprDNN, ExecutionPolicyShallowCopyConvBias) {
    test_execution_policy_shallow_copy<opr::ConvBias, 3>(
            {TensorShape{20, 16, 50, 50}, TensorShape{24, 16, 3, 3},
             TensorShape{1, 24, 1, 1}});
}
// Shallow-copy policy preservation for Convolution3D fwd / bwd-data /
// bwd-filter.
TEST(TestOprDNN, ExecutionPolicyShallowCopyConvolution3D) {
    test_execution_policy_shallow_copy<opr::Convolution3D, 2>(
            {TensorShape{8, 4, 12, 13, 14}, TensorShape{4, 4, 3, 3, 3}});
    test_execution_policy_shallow_copy<opr::Convolution3DBackwardData, 3>(
            {TensorShape{5, 5, 3, 3, 3}, TensorShape{14, 5, 12, 12, 16},
             TensorShape{14, 5, 14, 14, 18}});
    test_execution_policy_shallow_copy<opr::Convolution3DBackwardFilter, 3>(
            {TensorShape{64, 16, 18, 18, 18}, TensorShape{64, 16, 18, 18, 18},
             TensorShape{16, 16, 1, 1, 1}});
}
  309. TEST(TestOprDNN, ExecutionPolicyShallowCopyLocalShare) {
  310. opr::LocalShare::Param local_share_param;
  311. local_share_param.mode = opr::LocalShare::Param::Mode::CROSS_CORRELATION;
  312. local_share_param.pad_h = local_share_param.pad_w = 1;
  313. local_share_param.stride_h = local_share_param.stride_w = 1;
  314. local_share_param.spatial_groups_h = local_share_param.spatial_groups_w = 2;
  315. test_execution_policy_shallow_copy<opr::LocalShareForward, 2>(
  316. {TensorShape{32, 2, 23, 23}, TensorShape{2, 2, 2, 2, 2, 7}},
  317. local_share_param);
  318. test_execution_policy_shallow_copy<opr::LocalShareBackwardData, 3>(
  319. {TensorShape{3, 3, 128, 1, 1, 128}, TensorShape{32, 128, 24, 24},
  320. TensorShape{32, 128, 24, 24}});
  321. test_execution_policy_shallow_copy<opr::LocalShareBackwardFilter, 3>(
  322. {TensorShape{12, 3, 36, 36}, TensorShape{12, 4, 35, 35},
  323. TensorShape{3, 3, 3, 3, 3, 4}});
  324. }
// Shallow-copy policy preservation for DeformableConv fwd (4 inputs) and
// bwd-data / bwd-filter (5 inputs).
TEST(TestOprDNN, ExecutionPolicyShallowCopyDeformableConv) {
    test_execution_policy_shallow_copy<opr::DeformableConvForward, 4>(
            {TensorShape{12, 6, 20, 20}, TensorShape{6, 6, 3, 3},
             TensorShape{12, 18, 18, 18}, TensorShape{12, 9, 18, 18}});
    test_execution_policy_shallow_copy<opr::DeformableConvBackwardData, 5>(
            {TensorShape{12, 6, 20, 20}, TensorShape{6, 6, 3, 3},
             TensorShape{12, 18, 18, 18}, TensorShape{12, 9, 18, 18},
             TensorShape{12, 6, 18, 18}});
    test_execution_policy_shallow_copy<opr::DeformableConvBackwardFilter, 5>(
            {TensorShape{12, 6, 20, 20}, TensorShape{6, 6, 3, 3},
             TensorShape{12, 18, 18, 18}, TensorShape{12, 9, 18, 18},
             TensorShape{12, 6, 18, 18}});
}
// Shallow-copy policy preservation for MatrixMul and BatchedMatrixMul.
TEST(TestOprDNN, ExecutionPolicyShallowCopyMatrixMul) {
    test_execution_policy_shallow_copy<opr::MatrixMul, 2>(
            {TensorShape{10, 12}, TensorShape{12, 12}});
    test_execution_policy_shallow_copy<opr::BatchedMatrixMul, 2>(
            {TensorShape{12, 6, 8}, TensorShape{12, 8, 4}});
}
// Shallow-copy policy preservation for Pooling (single input) and
// PoolingBackward (three inputs).
TEST(TestOprDNN, ExecutionPolicyShallowCopyPooling) {
    test_execution_policy_shallow_copy<opr::Pooling, 1>({TensorShape{1, 20, 24, 24}});
    test_execution_policy_shallow_copy<opr::PoolingBackward, 3>(
            {TensorShape{1, 20, 24, 24}, TensorShape{1, 20, 12, 12},
             TensorShape{1, 20, 12, 12}});
}
  350. // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}