
fusion.cpp 53 kB

  1. /**
  2. * \file src/jit/test/fusion.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  10. */
  11. #include "./helper.h"
  12. #include "megbrain_build_config.h"
  13. #include "megbrain/gopt/framework.h"
  14. #include "megbrain/gopt/misc.h"
  15. #include "megbrain/graph/cg.h"
  16. #include "megbrain/jit/ast_c.h"
  17. #include "megbrain/jit/executor_opr.h"
  18. #include "megbrain/jit/fusion_pass.h"
  19. #include "megbrain/opr/basic_arith_wrapper.h"
  20. #include "megbrain/opr/blas.h"
  21. #include "megbrain/opr/tensor_manip.h"
  22. #include "megbrain/opr/utility.h"
  23. #include "megbrain/test/autocheck.h"
  24. #include "megbrain/test/helper.h"
  25. #include "megbrain/opr/dnn/convolution.h"
  26. #if MGB_JIT
  27. using namespace mgb;
  28. using namespace jit;
  29. #define FOREACH_CASE(cb) \
  30. cb(basic) cb(shape_change) cb(large_num_inps) cb(simple_exp) \
  31. cb(complex_exp) cb(exp_pow) cb(cache) cb(all_oprs) \
  32. cb(expand_jit_executor) cb(multi_device) cb(multi_shape) \
  33. cb(non_contig) cb(visit_complexity) cb(imm_scalar) \
  34. cb(jit_grad) cb(concat_input) cb(special_graph_input)
  35. namespace {
  36. #define def_tag(x) \
  37. struct x {};
  38. FOREACH_CASE(def_tag)
  39. #undef def_tag
  40. #define t(n) n,
  41. using test_types = ::testing::Types<FOREACH_CASE(t) void>;
  42. #undef t
  43. template <typename tag>
  44. void run(Backend backend, CompNode cn);
  45. template <typename T>
  46. size_t find_opr_num(SymbolVar endpoint) {
  47. size_t opr_num = 0;
  48. auto cb = [&opr_num](cg::OperatorNodeBase* opr) {
  49. if (opr->same_type<T>()) {
  50. opr_num++;
  51. }
  52. };
  53. cg::DepOprIter{cb}.add(endpoint.node()->owner_opr());
  54. return opr_num;
  55. }
  56. template <typename T>
  57. SmallVector<T*> find_oprs(SymbolVar endpoint) {
  58. SmallVector<T*> res;
  59. auto cb = [&res](cg::OperatorNodeBase* opr) {
  60. if (opr->same_type<T>()) {
  61. auto ptr = &(opr->cast_final_safe<T>());
  62. res.push_back(ptr);
  63. }
  64. };
  65. cg::DepOprIter{cb}.add(endpoint.node()->owner_opr());
  66. return res;
  67. }
  68. template <typename T>
  69. SmallVector<T*> find_oprs(cg::AsyncExecutable& func) {
  70. SmallVector<T*> res;
  71. auto cb = [&res](cg::OperatorNodeBase* opr) {
  72. if (opr->same_type<T>()) {
  73. auto ptr = &(opr->cast_final_safe<T>());
  74. res.push_back(ptr);
  75. }
  76. return true;
  77. };
  78. func.iter_opr_seq(cb);
  79. return res;
  80. }
  81. //! make a pair of functions with and without JIT optimization
  82. std::pair<std::unique_ptr<cg::AsyncExecutable>,
  83. std::unique_ptr<cg::AsyncExecutable>>
  84. make_func_pair(HostTensorND& dst0, HostTensorND& dst1,
  85. thin_function<SymbolVar(ComputingGraph&)> make_dst,
  86. uint8_t jit_level) {
  87. auto g0 = ComputingGraph::make();
  88. g0->options().graph_opt_level = 0;
  89. auto f0 = g0->compile({make_callback_copy(make_dst(*g0), dst0)});
  90. auto g1 = ComputingGraph::make();
  91. g1->options().graph_opt_level = 3;
  92. g1->options().graph_opt.jit = jit_level;
  93. auto f1 = g1->compile({make_callback_copy(make_dst(*g1), dst1)});
  94. EXPECT_FALSE(find_oprs<JITExecutor>(*f1).empty());
  95. return {std::move(f0), std::move(f1)};
  96. }
  97. template <>
  98. void run<void>(Backend, CompNode) {}
  99. template <>
  100. void run<basic>(Backend backend, CompNode cn) {
  101. set_backend(backend);
  102. HostTensorGenerator<> gen;
  103. auto host_x0 = gen({3, 3}, cn), host_x1 = gen({3, 1}, cn),
  104. host_x2 = gen({1, 1}, cn), host_x3 = gen({3, 1}, cn);
  105. auto make_dst = [&](ComputingGraph& graph) {
  106. auto a = opr::Host2DeviceCopy::make(graph, host_x0),
  107. b = opr::Host2DeviceCopy::make(graph, host_x1),
  108. c = opr::Host2DeviceCopy::make(graph, host_x2),
  109. d = opr::Host2DeviceCopy::make(graph, host_x3);
  110. return a * b + c * a + d + d + d;
  111. };
  112. HostTensorND host_z1, host_z2;
  113. auto funcs = make_func_pair(host_z1, host_z2, make_dst, 2);
  114. funcs.first->execute();
  115. funcs.second->execute();
  116. MGB_ASSERT_TENSOR_EQ(host_z1, host_z2);
  117. auto jits = find_oprs<JITExecutor>(*funcs.second);
  118. ASSERT_EQ(2u, jits.size());
  119. // only one broadcast is allowed in JIT fusion
  120. ASSERT_EQ(1u, jits[0]->input().size());
  121. ASSERT_EQ(4u, jits[1]->input().size());
  122. }
  123. template <>
  124. void run<shape_change>(Backend backend, CompNode cn) {
  125. set_backend(backend);
  126. HostTensorGenerator<> gen;
  127. auto host_x0 = gen({3, 3}, cn), host_x1 = gen({3, 1}, cn),
  128. host_x2 = gen({1, 1}, cn), host_x3 = gen({1, 3}, cn);
  129. auto run_gen = [&](size_t n, bool dim = false, bool swap = false) {
  130. if (dim) {
  131. host_x0->copy_from(*gen({n, n, 3}, cn));
  132. host_x1->copy_from(*gen({n, 1, 1}, cn));
  133. host_x2->copy_from(*gen({1, 1, 3}, cn));
  134. host_x3->copy_from(*gen({1, n, 1}, cn));
  135. } else {
  136. host_x0->copy_from(*gen({n, n}, cn));
  137. host_x1->copy_from(*gen({n, 1}, cn));
  138. host_x2->copy_from(*gen({1, 1}, cn));
  139. host_x3->copy_from(*gen({1, n}, cn));
  140. }
  141. if (swap) {
  142. std::swap(*host_x1, *host_x3);
  143. }
  144. };
  145. using JITOprArr = std::array<JITExecutor*, 2>;
  146. auto make_func = [&](HostTensorND& out, JITOprArr* jit) {
  147. auto graph = ComputingGraph::make();
  148. graph->options().graph_opt_level = 0;
  149. auto a = opr::Host2DeviceCopy::make(*graph, host_x0),
  150. b = opr::Host2DeviceCopy::make(*graph, host_x1),
  151. c = opr::Host2DeviceCopy::make(*graph, host_x2),
  152. d = opr::Host2DeviceCopy::make(*graph, host_x3);
  153. auto y = opr::abs(a) * (b + c) * d - (b + c) * c * b;
  154. if (jit) {
  155. graph->options().graph_opt_level = 3;
  156. }
  157. auto func = graph->compile({make_callback_copy(y, out)});
  158. if (jit) {
  159. unpack_vector(find_oprs<JITExecutor>(*func), (*jit)[0], (*jit)[1]);
  160. }
  161. return func;
  162. };
  163. JITOprArr jits;
  164. HostTensorND host_y1, host_y2;
  165. auto func1 = make_func(host_y1, nullptr), func2 = make_func(host_y2, &jits);
  166. auto run = [&]() -> std::array<Executable*, 2> {
  167. func1->execute();
  168. func2->execute();
  169. auto chk = [&]() { MGB_ASSERT_TENSOR_EQ(host_y1, host_y2); };
  170. chk();
  171. return {jits[0]->executable(), jits[1]->executable()};
  172. };
  173. auto exe_shp3 = run();
  174. {
  175. run_gen(5);
  176. auto exe_shp5 = run();
  177. if (backend == Backend::HALIDE) {
  178. ASSERT_NE(exe_shp3, exe_shp5);
  179. } else {
  180. ASSERT_EQ(exe_shp3, exe_shp5);
  181. }
  182. }
  183. // change ndim
  184. run_gen(3, true);
  185. ASSERT_NE(exe_shp3, run());
  186. // change bcast pattern
  187. {
  188. run_gen(3, false, true);
  189. auto exe_chg = run();
  190. if (backend == Backend::HALIDE) {
  191. ASSERT_NE(exe_shp3, exe_chg);
  192. } else {
  193. ASSERT_EQ(exe_shp3, exe_chg);
  194. }
  195. }
  196. run_gen(3);
  197. ASSERT_EQ(exe_shp3, run());
  198. }
  199. template <>
  200. void run<large_num_inps>(Backend backend, CompNode cn) {
  201. set_backend(backend);
  202. HostTensorGenerator<> gen;
  203. int inp_nr = 120;
  204. std::vector<std::shared_ptr<HostTensorND>> host_xs;
  205. for (int i = 0; i < inp_nr; i++)
  206. host_xs.push_back(gen({4, 3, 2, 1}, cn));
  207. auto make_dst = [&](ComputingGraph& graph) {
  208. std::vector<SymbolVar> dev_xs;
  209. for (int i = 0; i < inp_nr; i++)
  210. dev_xs.push_back(opr::Host2DeviceCopy::make(graph, host_xs[i]));
  211. auto y = dev_xs[0] + dev_xs[1];
  212. for (int i = 2; i < inp_nr; i++)
  213. y = y + dev_xs[i];
  214. return y;
  215. };
  216. HostTensorND host_y1, host_y2;
  217. auto funcs = make_func_pair(host_y1, host_y2, make_dst, 2);
  218. funcs.first->execute();
  219. funcs.second->execute();
  220. MGB_ASSERT_TENSOR_EQ(host_y1, host_y2);
  221. ASSERT_GT(find_oprs<JITExecutor>(*funcs.second).size(), 1u);
  222. }
  223. template <>
  224. void run<concat_input>(Backend backend, CompNode cn) {
  225. set_backend(backend);
  226. FusionChecker checker{
  227. 4,
  228. [](const SymbolVarArray& inp) -> SymbolVar {
  229. auto spl = opr::Split::make(
  230. inp[0],
  231. opr::Split::Options::make_partition(inp[0], 1, {1, 1}));
  232. return spl[1] * inp[1] + inp[2] * spl[1] + inp[3] + inp[3];
  233. },
  234. cn};
  235. checker.disable_opr_type_check().run({TensorShape{3, 2}, {3, 1}, {3, 1}, {3, 1}});
  236. }
  237. template <>
  238. void run<simple_exp>(Backend backend, CompNode cn) {
  239. set_backend(backend);
  240. FusionChecker checker{2,
  241. [](const SymbolVarArray& inp) -> SymbolVar {
  242. return inp[0] + inp[1];
  243. },
  244. cn};
  245. checker.enable_direct_build().run({TensorShape{3, 3}, {3, 3}});
  246. }
  247. template <>
  248. void run<jit_grad>(Backend backend, CompNode cn) {
  249. set_backend(backend);
  250. FusionChecker checker{
  251. 1,
  252. [](const SymbolVarArray& inp) -> SymbolVar { return inp[0] + 1; },
  253. cn};
  254. checker.enable_direct_build().run({TensorShape{3, 1}});
  255. }
  256. template <>
  257. void run<exp_pow>(Backend backend, CompNode cn) {
  258. set_backend(backend);
  259. FusionChecker checker{
  260. 3,
  261. [](const SymbolVarArray& inp) -> SymbolVar {
  262. auto iabs = opr::abs(inp[0]) + .23f;
  263. return opr::exp(inp[0]) + opr::exp(inp[1]) -
  264. opr::exp(inp[2]) * opr::pow(opr::abs(inp[1]) + 0.2f,
  265. opr::abs(inp[2]) + 0.1f) +
  266. opr::powf(inp[0], 2) - opr::powf(inp[0], -3) +
  267. opr::powf(iabs, 1.f / 3.f) +
  268. opr::PowC::make(iabs, -1.f / 3.f) +
  269. opr::PowC::make(iabs, .5f) + opr::PowC::make(iabs, -.5f);
  270. },
  271. cn};
  272. checker.run({TensorShape{2, 3}, {2, 3}, {2, 3}});
  273. }
  274. template <>
  275. void run<complex_exp>(Backend backend, CompNode cn) {
  276. set_backend(backend);
  277. FusionChecker checker{4,
  278. [](const SymbolVarArray& inp) -> SymbolVar {
  279. return opr::abs(inp[0]) * (inp[1] + inp[2]) *
  280. inp[3] -
  281. (inp[1] + inp[2]) * inp[2] / inp[1];
  282. },
  283. cn};
  284. checker.run({TensorShape{3, 3}, {1, 3}, {3, 1}, {1, 3}});
  285. }
  286. template <>
  287. void run<cache>(Backend backend, CompNode cn) {
  288. set_backend(backend);
  289. auto graph = ComputingGraph::make();
  290. HostTensorGenerator<> gen;
  291. auto host_a = gen({1}, cn), host_b = gen({1}, cn), host_c = gen({1}, cn);
  292. auto a = opr::Host2DeviceCopy::make(*graph, host_a),
  293. b = opr::Host2DeviceCopy::make(*graph, host_b),
  294. c = opr::Host2DeviceCopy::make(*graph, host_c), x = opr::sin(a + 1),
  295. y = opr::cos(b + 1), z = opr::sin(c + 1);
  296. gopt::GraphOptimizer gopt;
  297. gopt.add_pass<gopt::JITFusionPass>();
  298. VarNodeArray vars{x.node(), y.node(), z.node()};
  299. gopt.apply_inplace(vars);
  300. ASSERT_NE(vars[0], vars[1]);
  301. ASSERT_NE(vars[0], vars[2]);
  302. ASSERT_NE(vars[1], vars[2]);
  303. auto func = graph->compile({{vars[0], {}}, {vars[1], {}}, {vars[2], {}}});
  304. func->execute();
  305. auto get_exe = [](SymbolVar var) {
  306. return var.node()
  307. ->owner_opr()
  308. ->cast_final_safe<JITExecutor>()
  309. .executable();
  310. };
  311. auto ex0 = get_exe(vars[0]), ex1 = get_exe(vars[1]), ex2 = get_exe(vars[2]);
  312. ASSERT_EQ(ex0, ex2);
  313. ASSERT_NE(ex0, ex1);
  314. }
  315. template <>
  316. void run<all_oprs>(Backend backend, CompNode cn) {
  317. // test all supported modes in multiple threads
  318. set_backend(backend);
  319. std::vector<std::pair<const char*, thin_function<void()>>> tasks;
  320. static auto itrans_none = [](SymbolVar* data, size_t size) {};
  321. static auto itrans_pos = [](SymbolVar* data, size_t size) {
  322. for (size_t i = 0; i < size; ++i) {
  323. data[i] = opr::abs(data[i]) + float(0.1f + 0.23f * i);
  324. }
  325. };
  326. static auto itrans_clip1 = [](SymbolVar* data, size_t size) {
  327. for (size_t i = 0; i < size; ++i) {
  328. data[i] = opr::max(opr::min(data[i], data[i].make_scalar_dt(0.9f)),
  329. data[i].make_scalar_dt(-0.9f));
  330. }
  331. };
  332. static auto itrans_gt0 = [](SymbolVar* data, size_t size) {
  333. for (size_t i = 0; i < size; ++i) {
  334. data[i] = opr::max(data[i], data[i].make_scalar_dt(0.1f));
  335. }
  336. };
  337. static auto itrans_ne0 = [](SymbolVar* data, size_t size) {
  338. for (size_t i = 0; i < size; ++i) {
  339. auto mask = opr::abs(data[i]) < 0.1f;
  340. data[i] = data[i] * (1.f - mask) + mask * (data[i] + 1.f);
  341. }
  342. };
  343. #define DO_CHK_ELEM(_mode, _arity, _do_grad, _itrans, _shps...) \
  344. tasks.emplace_back(#_mode, [cn]() { \
  345. FusionChecker chk{_arity, \
  346. [](SymbolVarArray inp) -> SymbolVar { \
  347. itrans_##_itrans(inp.data(), inp.size()); \
  348. return opr::Elemwise::make( \
  349. inp, opr::Elemwise::Mode::_mode); \
  350. }, \
  351. cn}; \
  352. chk.enable_direct_build(); \
  353. if (!_do_grad) { \
  354. chk.disable_inp_grad(); \
  355. } \
  356. chk.run({_shps}); \
  357. })
  358. #define CHECK_ELEM1(_mode, _do_grad, _itrans) \
  359. DO_CHK_ELEM(_mode, 1, _do_grad, _itrans, TensorShape{9, 12, 7})
  360. #define CHECK_ELEM2(_mode, _do_grad, _itrans) \
  361. DO_CHK_ELEM(_mode, 2, _do_grad, _itrans, TensorShape{9, 12, 7}, \
  362. TensorShape{9, 1, 7})
  363. #define CHECK_ELEM3(_mode, _do_grad, _itrans) \
  364. DO_CHK_ELEM(_mode, 3, _do_grad, _itrans, TensorShape{9, 12, 7}, \
  365. TensorShape{9, 1, 7}, TensorShape{1, 12, 7})
  366. #define CHECK_ELEM4(_mode, _do_grad, _itrans) \
  367. DO_CHK_ELEM(_mode, 4, _do_grad, _itrans, TensorShape{9, 12, 7}, \
  368. TensorShape{9, 1, 7}, TensorShape{1, 12, 7}, \
  369. TensorShape{9, 12, 1})
  370. CHECK_ELEM1(RELU, true, none);
  371. CHECK_ELEM1(ABS, true, none);
  372. CHECK_ELEM1(ACOS, true, clip1);
  373. CHECK_ELEM1(ASIN, true, clip1);
  374. CHECK_ELEM1(CEIL, false, none);
  375. CHECK_ELEM1(COS, true, none);
  376. CHECK_ELEM1(EXP, true, none);
  377. CHECK_ELEM1(EXPM1, true, none);
  378. CHECK_ELEM1(FLOOR, false, none);
  379. CHECK_ELEM1(LOG, true, gt0);
  380. CHECK_ELEM1(LOG1P, true, gt0);
  381. CHECK_ELEM1(NEGATE, true, none);
  382. CHECK_ELEM1(SIGMOID, true, none);
  383. CHECK_ELEM1(SIN, true, none);
  384. CHECK_ELEM1(TANH, true, none);
  385. CHECK_ELEM1(ERF, true, none);
  386. CHECK_ELEM1(ERFC, true, none);
  387. CHECK_ELEM1(H_SWISH, true, none);
  388. CHECK_ELEM2(ABS_GRAD, true, none);
  389. CHECK_ELEM2(ADD, true, none);
  390. CHECK_ELEM2(FLOOR_DIV, false, ne0);
  391. CHECK_ELEM2(MAX, true, none);
  392. CHECK_ELEM2(MIN, true, none);
  393. CHECK_ELEM2(MOD, false, ne0);
  394. CHECK_ELEM2(MUL, true, none);
  395. CHECK_ELEM2(POW, true, pos);
  396. CHECK_ELEM2(SIGMOID_GRAD, true, none);
  397. CHECK_ELEM2(SUB, true, none);
  398. CHECK_ELEM2(SWITCH_GT0, true, none);
  399. CHECK_ELEM2(TANH_GRAD, true, none);
  400. CHECK_ELEM2(TRUE_DIV, true, ne0);
  401. CHECK_ELEM2(LOG_SUM_EXP, true, none);
  402. CHECK_ELEM2(H_SWISH_GRAD, false, none);
  403. CHECK_ELEM2(LT, false, none);
  404. CHECK_ELEM2(LEQ, false, none);
  405. CHECK_ELEM2(EQ, false, none);
  406. CHECK_ELEM2(ATAN2, true, gt0);
  407. CHECK_ELEM3(COND_LEQ_MOV, false, none);
  408. CHECK_ELEM3(FUSE_MUL_ADD3, true, none);
  409. CHECK_ELEM4(FUSE_MUL_ADD4, true, none);
  410. CHECK_ELEM2(FUSE_ADD_RELU, true, none);
  411. CHECK_ELEM2(FUSE_ADD_SIGMOID, true, none);
  412. CHECK_ELEM2(FUSE_ADD_TANH, true, none);
  413. CHECK_ELEM2(FUSE_ADD_H_SWISH, true, none);
  414. ASSERT_EQ(ast_c::elem_opr_generator().size(), tasks.size());
  415. auto type_cvt_test = [&](const char* name, DType src_dtype,
  416. DType dst_dtype) {
  417. tasks.emplace_back(name, [cn, src_dtype, dst_dtype]() {
  418. FusionChecker checker{
  419. 1,
  420. [dst_dtype](const SymbolVarArray& inp) -> SymbolVar {
  421. return opr::TypeCvt::make(inp[0], dst_dtype);
  422. },
  423. cn};
  424. checker.enable_direct_build();
  425. checker.set_dtype(0, src_dtype).run({TensorShape{4, 7, 99, 1}});
  426. });
  427. };
  428. type_cvt_test("f16->f32", dtype::Float16(), dtype::Float32());
  429. type_cvt_test("f32->f16", dtype::Float32(), dtype::Float16());
  430. #undef CHECK_ELEM1
  431. #undef CHECK_ELEM2
  432. #undef CHECK_ELEM3
  433. #undef CHECK_ELEM4
  434. #undef DO_CHK_ELEM
  435. std::vector<std::thread> workers;
  436. std::atomic_size_t finished_tasks{0};
  437. auto worker = [&tasks, &finished_tasks](int wid) {
  438. for (;;) {
  439. size_t id = finished_tasks.fetch_add(1);
  440. if (id >= tasks.size()) {
  441. return;
  442. }
  443. if (!::testing::Test::HasFailure()) {
  444. mgb_log("going to run %s on worker %d", tasks[id].first, wid);
  445. ASSERT_NO_THROW(tasks[id].second())
  446. << "failed for " << tasks[id].first;
  447. }
  448. }
  449. };
  450. int nr_worker;
  451. if (auto set = MGB_GETENV("MGB_JIT_TEST_WORKER")) {
  452. nr_worker = std::stoi(set);
  453. } else {
  454. nr_worker = CompNode::get_device_count(CompNode::DeviceType::CPU) / 2;
  455. }
  456. if (nr_worker == 1) {
  457. worker(-1);
  458. } else {
  459. for (int i = 0; i < nr_worker; ++i) {
  460. workers.emplace_back(worker, i);
  461. }
  462. for (auto&& i : workers) {
  463. i.join();
  464. }
  465. }
  466. ASSERT_GE(finished_tasks.load(), tasks.size());
  467. }
  468. template <>
  469. void run<expand_jit_executor>(Backend backend, CompNode cn) {
  470. set_backend(backend);
  471. auto make_jit = [](SymbolVar target, const SymbolVarArray& inputs) {
  472. auto y = target.node();
  473. auto ig_gen = std::make_unique<InternalGraphGenerator>(y->owner_opr());
  474. auto inputs_vptr = cg::to_var_node_array(inputs);
  475. for (auto i : get_rev_topo_order(
  476. target, {inputs_vptr.begin(), inputs_vptr.end()})) {
  477. ig_gen->add_opr(i);
  478. }
  479. auto igraph = ig_gen->generate();
  480. return JITExecutor::make(igraph, ig_gen->orig_inps());
  481. };
  482. auto graph = ComputingGraph::make();
  483. graph->options().graph_opt_level = 3;
  484. HostTensorGenerator<> gen;
  485. auto host_x = gen({3, 3}, cn);
  486. auto x = opr::Host2DeviceCopy::make(*graph, host_x);
  487. auto type_cvt_x = opr::TypeCvt::make(x, dtype::Float16());
  488. auto relu_x = opr::relu(type_cvt_x);
  489. auto sin_x = opr::sin(relu_x);
  490. auto host_y = gen({3, 3}, cn);
  491. auto y = opr::Host2DeviceCopy::make(*graph, host_y);
  492. auto type_cvt_y = opr::TypeCvt::make(y, dtype::Float16());
  493. auto relu_y = opr::relu(type_cvt_y);
  494. auto sin_y = opr::sin(relu_y);
  495. auto fusion_x = make_jit(sin_x, {relu_x});
  496. auto fusion_y = make_jit(sin_y, {type_cvt_y});
  497. auto z = fusion_x + fusion_y;
  498. // expanding at endpoint
  499. auto fusion0_x = make_jit(sin_x, {type_cvt_x});
  500. auto fusion1_x = make_jit(fusion0_x, {x});
  501. auto fusion2_x = make_jit(sin_x, {x});
  502. ASSERT_EQ(fusion1_x, fusion2_x);
  503. // expand multiple JITExecutors
  504. auto fusion_z = make_jit(z, {x, y});
  505. auto fusion_z_expected = make_jit(sin_x + sin_y, {x, y});
  506. ASSERT_EQ(fusion_z, fusion_z_expected);
  507. }
  508. SymbolVar jit_stop(SymbolVar x) {
  509. return opr::Sleep::make(x, 1e-3);
  510. }
  511. template <>
  512. void run<multi_device>(Backend backend, CompNode cn) {
  513. set_backend(backend);
  514. auto loc = cn.locator_logical();
  515. mgb_assert(loc.device >= 0);
  516. loc.device += 1;
  517. if (loc.device >= static_cast<int>(CompNode::get_device_count(loc.type))) {
  518. return;
  519. }
  520. HostTensorGenerator<> gen;
  521. auto cn1 = CompNode::load(loc);
  522. auto host_x = gen({42, 23}, cn);
  523. auto make_dst = [&](ComputingGraph& graph) {
  524. auto x = opr::Host2DeviceCopy::make(graph, host_x),
  525. a = opr::tanh(x) + opr::sin(x), y = opr::Copy::make(x, cn1),
  526. b = opr::tanh(y) + opr::sin(y);
  527. return jit_stop(a) + opr::Copy::make(b, cn);
  528. };
  529. HostTensorND host_z1, host_z2;
  530. auto funcs = make_func_pair(host_z1, host_z2, make_dst, 2);
  531. for (int i = 0; i < 8; ++i) {
  532. funcs.first->execute();
  533. funcs.second->execute();
  534. if (i == 4) {
  535. host_x->copy_from(*gen({10, 20, 3}, cn));
  536. } else {
  537. host_x->copy_from(*gen(host_x->shape(), cn));
  538. }
  539. MGB_ASSERT_TENSOR_EQ(host_z1, host_z2);
  540. }
  541. auto jits = find_oprs<JITExecutor>(*funcs.second);
  542. ASSERT_EQ(2u, jits.size());
  543. ASSERT_EQ(jits[0]->internal_graph().output(),
  544. jits[1]->internal_graph().output());
  545. }
  546. template <>
  547. void run<multi_shape>(Backend backend, CompNode cn) {
  548. // multiple shapes of same computing expr
  549. set_backend(backend);
  550. HostTensorGenerator<> gen;
  551. auto host_x = gen({4, 2, 3}, cn), host_y = gen({4, 2}, cn);
  552. auto make_dst = [&](ComputingGraph& graph) {
  553. auto x = opr::Host2DeviceCopy::make(graph, host_x).rename("x"),
  554. y = opr::Host2DeviceCopy::make(graph, host_y).rename("y"),
  555. jit0 = jit_stop(opr::sin(x) * x),
  556. a = opr::AxisAddRemove::make(
  557. opr::Reduce::make(jit0,
  558. {opr::Reduce::Param::Mode::SUM, 2}),
  559. {opr::AxisAddRemove::AxisDesc::make_remove(2)}),
  560. jit1 = jit_stop(opr::sin(a) + opr::sin(y)),
  561. jit2 = opr::sin(jit1) * jit1;
  562. return jit2;
  563. };
  564. HostTensorND host_z1, host_z2;
  565. auto funcs = make_func_pair(host_z1, host_z2, make_dst, 2);
  566. auto jits = find_oprs<JITExecutor>(*funcs.second);
  567. ASSERT_EQ(3u, jits.size());
  568. ASSERT_EQ(jits[0]->internal_graph().output(),
  569. jits[2]->internal_graph().output());
  570. for (int i = 0; i < 8; ++i) {
  571. funcs.first->execute();
  572. funcs.second->execute();
  573. if (i == 4) {
  574. host_x->copy_from(*gen({3, 7, 5}, cn));
  575. host_y->copy_from(*gen({3, 7}, cn));
  576. } else {
  577. host_x->copy_from(*gen(host_x->shape(), cn));
  578. host_y->copy_from(*gen(host_y->shape(), cn));
  579. }
  580. MGB_ASSERT_TENSOR_EQ(host_z1, host_z2);
  581. }
  582. }
  583. template <>
  584. void run<non_contig>(Backend backend, CompNode cn) {
  585. set_backend(backend);
  586. HostTensorGenerator<> gen;
  587. auto host_x = gen({2, 3}, cn);
  588. SmallVector<std::pair<SymbolVar, SymbolVar>> subs;
  589. auto make_dst = [&](ComputingGraph& graph) {
  590. auto x = opr::Host2DeviceCopy::make(graph, host_x),
  591. y = opr::Subtensor::make(
  592. x, {opr::Subtensor::AxisIndexer::make_interval(
  593. 1, x.make_scalar(1), x.make_scalar(3), None)});
  594. subs.emplace_back(x, y);
  595. return opr::sin(y) * y;
  596. };
  597. HostTensorND y0, y1;
  598. auto funcs = make_func_pair(y0, y1, make_dst, 2);
  599. for (size_t s : {4, 7}) {
  600. *host_x = *gen({3, s});
  601. funcs.first->execute();
  602. funcs.second->execute();
  603. MGB_ASSERT_TENSOR_EQ(y0, y1);
  604. }
  605. ASSERT_EQ(2u, subs.size());
  606. for (int i = 0; i < 2; ++i) {
  607. auto p0 = static_cast<const float*>(prev_dev_ptr(subs[i].first)) + 1,
  608. p1 = static_cast<const float*>(prev_dev_ptr(subs[i].second));
  609. if (backend != Backend::HALIDE || !i) {
  610. ASSERT_EQ(p0, p1);
  611. } else {
  612. ASSERT_NE(p0, p1);
  613. }
  614. }
  615. }
  616. template <>
  617. void run<visit_complexity>(Backend backend, CompNode cn) {
  618. // build a graph that would have exponential complexity if graph visiting is
  619. // not correctly implemented
  620. set_backend(backend);
  621. HostTensorGenerator<dtype::Float32, RandomDistribution::UNIFORM> gen{0.01f,
  622. 0.02f};
  623. auto host_x = gen({3, 4}, cn);
  624. auto make_dst = [&](ComputingGraph& graph) {
  625. auto x = opr::Host2DeviceCopy::make(graph, host_x);
  626. auto y = x;
  627. for (int i = 0; i < 32; ++i) {
  628. y = y * y + y;
  629. }
  630. return y;
  631. };
  632. HostTensorND host_y1, host_y2;
  633. auto funcs = make_func_pair(host_y1, host_y2, make_dst, 2);
  634. funcs.first->execute();
  635. funcs.second->execute();
  636. MGB_ASSERT_TENSOR_EQ(host_y1, host_y2);
  637. ASSERT_EQ(1u, find_oprs<JITExecutor>(*funcs.second).size());
  638. ASSERT_TRUE(find_oprs<opr::Elemwise>(*funcs.second).empty());
  639. }
  640. template <>
  641. void run<imm_scalar>(Backend backend, CompNode cn) {
  642. set_backend(backend);
  643. HostTensorGenerator<> gen;
  644. auto host_x = gen({2, 3, 4}, cn);
  645. auto make_dst = [&](ComputingGraph& graph) {
  646. auto x = opr::Host2DeviceCopy::make(graph, host_x);
  647. return (x * x + 1.f) / (opr::sin(x) + 1.2f) * .3f;
  648. };
  649. HostTensorND host_y1, host_y2;
  650. auto funcs = make_func_pair(host_y1, host_y2, make_dst, 2);
  651. funcs.first->execute();
  652. funcs.second->execute();
  653. MGB_ASSERT_TENSOR_EQ(host_y1, host_y2);
  654. JITExecutor* jit;
  655. unpack_vector(find_oprs<JITExecutor>(*funcs.second), jit);
  656. ASSERT_TRUE(find_oprs<opr::Elemwise>(*funcs.second).empty());
  657. ASSERT_EQ(1u, jit->input().size());
  658. ASSERT_TRUE(jit->input(0)->owner_opr()->same_type<opr::Host2DeviceCopy>());
  659. }
  660. template <>
  661. void run<special_graph_input>(Backend backend, CompNode cn) {
  662. set_backend(backend);
  663. HostTensorGenerator<> gen;
  664. auto host_x = gen({3, 3}, cn);
  665. auto host_y = gen({2, 1}, cn);
  666. auto make_dst = [&](ComputingGraph& graph) {
  667. auto x = opr::Host2DeviceCopy::make(graph, host_x);
  668. auto y = opr::Host2DeviceCopy::make(graph, host_y);
  669. auto spl = opr::Split::make(x,
  670. opr::Split::Options::make_partition(x, 1, {1, 2}));
  671. auto mat = mgb::opr::MatrixMul::make(spl[1], y);
  672. return (spl[0] * spl[0] + 1.f) / (mat + 1.2f) * .3f;
  673. };
  674. HostTensorND host_y1, host_y2;
  675. auto funcs = make_func_pair(host_y1, host_y2, make_dst, 2);
  676. funcs.first->execute();
  677. funcs.second->execute();
  678. MGB_ASSERT_TENSOR_EQ(host_y1, host_y2);
  679. JITExecutor* jit;
  680. unpack_vector(find_oprs<JITExecutor>(*funcs.second), jit);
  681. ASSERT_TRUE(find_oprs<opr::Elemwise>(*funcs.second).empty());
  682. ASSERT_EQ(2u, jit->input().size());
  683. }
  684. } // namespace
  685. #if MGB_JIT_HALIDE
  686. TEST(TestJITFusionHalide, SimpleReduce) {
  687. REQUIRE_GPU(1);
  688. set_backend(Backend::HALIDE);
  689. auto graph = ComputingGraph::make();
  690. graph->options().graph_opt_level = 3;
  691. graph->options().graph_opt.jit = 2;
  692. HostTensorGenerator<> gen;
  693. auto host_x0 = gen({3, 3}), host_x1 = gen({3, 1});
  694. auto a = opr::Host2DeviceCopy::make(*graph, host_x0),
  695. b = opr::Host2DeviceCopy::make(*graph, host_x1),
  696. y = opr::reduce_sum(a + b, opr::GetVarShape::make(b)),
  697. z = opr::reduce_sum(a * b, opr::GetVarShape::make(a)) + y;
  698. SymbolVar z_opt;
  699. unpack_vector(gopt::GraphOptimizer{}
  700. .add_preset_passes(true, nullptr, &(graph->options()))
  701. .apply({{z}})
  702. .endpoint_vars(),
  703. z_opt);
  704. ASSERT_EQ(2u, find_opr_num<mgb::jit::JITExecutor>(z_opt));
  705. HostTensorND h;
  706. graph->compile({make_callback_copy(z_opt, h)})
  707. ->to_json()
  708. ->writeto_fpath(
  709. output_file("TestJITFusionHalide.SimpleReduce.json"));
  710. }
  711. TEST(TestJITFusionHalide, JITExecutor) {
  712. REQUIRE_GPU(1);
  713. set_backend(Backend::HALIDE);
  714. auto graph = ComputingGraph::make();
  715. graph->options().graph_opt_level = 3;
  716. graph->options().graph_opt.jit = 2;
  717. HostTensorGenerator<> gen;
  718. auto host_x0 = gen({3, 3}), host_x1 = gen({3, 1}), host_x2 = gen({3, 3}),
  719. host_x3 = gen({3, 1});
  720. auto a = opr::Host2DeviceCopy::make(*graph, host_x0),
  721. b = opr::Host2DeviceCopy::make(*graph, host_x1),
  722. c = opr::Host2DeviceCopy::make(*graph, host_x2),
  723. d = opr::Host2DeviceCopy::make(*graph, host_x3),
  724. shape_of_b = opr::GetVarShape::make(b),
  725. shape_of_a = opr::GetVarShape::make(a),
  726. y = opr::reduce_sum(a + b, shape_of_b),
  727. z = opr::reduce_sum(a * b, shape_of_a);
  728. auto ig_gen_1 =
  729. std::make_unique<InternalGraphGenerator>(y.node()->owner_opr());
  730. auto ig_gen_2 =
  731. std::make_unique<InternalGraphGenerator>(z.node()->owner_opr());
  732. {
  733. ThinHashSet<VarNode*> nd_set;
  734. nd_set.insert(a.node());
  735. nd_set.insert(b.node());
  736. nd_set.insert(shape_of_b.node());
  737. auto topo = get_rev_topo_order(y, nd_set);
  738. for (auto opr : topo) {
  739. ig_gen_1->add_opr(opr);
  740. }
  741. }
  742. {
  743. ThinHashSet<VarNode*> nd_set;
  744. nd_set.insert(a.node());
  745. nd_set.insert(b.node());
  746. nd_set.insert(shape_of_a.node());
  747. auto topo = get_rev_topo_order(z, nd_set);
  748. for (auto opr : topo) {
  749. ig_gen_2->add_opr(opr);
  750. }
  751. }
  752. auto ig_1 = ig_gen_1->generate(), ig_2 = ig_gen_2->generate();
  753. auto jit_1 = JITExecutor::make(ig_1, ig_gen_1->orig_inps());
  754. auto jit_2 = JITExecutor::make(ig_2, ig_gen_2->orig_inps());
  755. auto w = opr::reduce_sum(a * b + c * d, opr::GetVarShape::make(a)),
  756. x = w + jit_1, u = x * jit_2;
  757. SymbolVar u_opt;
  758. unpack_vector(gopt::GraphOptimizer{}
  759. .add_preset_passes(true, nullptr, &(graph->options()))
  760. .apply({{u}})
  761. .endpoint_vars(),
  762. u_opt);
  763. ASSERT_EQ(2u, find_opr_num<mgb::jit::JITExecutor>(u_opt));
  764. ASSERT_GT(1u, find_opr_num<opr::Elemwise>(u_opt));
  765. HostTensorND h;
  766. graph->compile({make_callback_copy(u_opt, h)})
  767. ->to_json()
  768. ->writeto_fpath(
  769. output_file("TestJITFusionHalide.JITExecutor.json"));
  770. }
  771. TEST(TestJITFusionHalide, BatchNormalization) {
  772. REQUIRE_GPU(1);
  773. set_backend(Backend::HALIDE);
  774. auto graph1 = ComputingGraph::make();
  775. graph1->options().graph_opt_level = 3;
  776. graph1->options().graph_opt.jit = 2;
  777. HostTensorGenerator<dtype::Float32, RandomDistribution::UNIFORM> gen{0.1,
  778. 1};
  779. size_t n = 32, c = 24, h = 28, w = 28;
  780. auto host_x0 = gen({n, c, h, w});
  781. auto host_tshp = std::make_shared<HostTensorND>(host_x0->comp_node(),
  782. dtype::Int32());
  783. host_tshp->resize({4});
  784. host_tshp->ptr<int>()[0] = 1;
  785. host_tshp->ptr<int>()[1] = c;
  786. host_tshp->ptr<int>()[2] = 1;
  787. host_tshp->ptr<int>()[3] = 1;
  788. auto host_pow = std::make_shared<HostTensorND>(host_x0->comp_node(),
  789. dtype::Float32());
  790. host_pow->resize({1});
  791. host_pow->ptr<float>()[0] = -0.5;
  792. auto pow = opr::Host2DeviceCopy::make(*graph1, host_pow, {"pow"});
  793. auto x = opr::Host2DeviceCopy::make(*graph1, host_x0, {"x"}),
  794. tshp = opr::Host2DeviceCopy::make(*graph1, host_tshp, {"tshp"});
  795. auto xshp = opr::GetVarShape::make(x);
  796. auto reduce_size = opr::reduce_prod(xshp, xshp.make_scalar(1)) /
  797. opr::reduce_prod(tshp, tshp.make_scalar(1));
  798. auto xx = opr::Elemwise::make({2 * x}, opr::Elemwise::Param::Mode::RELU);
  799. auto x1 = opr::reduce_sum(xx, tshp);
  800. auto x2 = opr::reduce_sum_sqr(xx, tshp);
  801. auto var = (x2 - x1 * x1 / reduce_size) / (reduce_size - 1),
  802. regular_var = var + (float)(1e-5);
  803. auto invsqrt_var = opr::Elemwise::make({regular_var, pow},
  804. opr::Elemwise::Param::Mode::POW);
  805. auto ovar = (x - x1 / reduce_size) * invsqrt_var;
  806. HostTensorND h_ovar;
  807. using Callback = thin_function<void(DeviceTensorND&)>;
  808. using OutputSpecItem = std::pair<SymbolVar, Callback>;
  809. using OutputSpec = std::vector<OutputSpecItem>;
  810. OutputSpec out_spec;
  811. out_spec.push_back(make_callback_copy(ovar, h_ovar));
  812. HostTensorND h_grad;
  813. bool do_grad = true;
  814. if (do_grad) {
  815. auto reduce_ovar = opr::reduce_sum(ovar * ovar, ovar.make_scalar(1));
  816. auto grad = cg::grad(reduce_ovar, x);
  817. out_spec.push_back(make_callback_copy(grad, h_grad));
  818. }
  819. auto func1 = graph1->compile(out_spec);
  820. func1->to_json()->writeto_fpath(
  821. output_file("TestJITFusionHalide.BatchNormalization.json"));
  822. func1->execute();
  823. auto graph2 = ComputingGraph::make();
  824. graph2->options().graph_opt_level = 0;
  825. auto pow_ = opr::Host2DeviceCopy::make(*graph2, host_pow, {"pow"});
  826. auto x_ = opr::Host2DeviceCopy::make(*graph2, host_x0, {"x"}),
  827. tshp_ = opr::Host2DeviceCopy::make(*graph2, host_tshp, {"tshp"});
  828. auto xshp_ = opr::GetVarShape::make(x_);
  829. auto reduce_size_ = opr::reduce_prod(xshp_, xshp_.make_scalar(1)) /
  830. opr::reduce_prod(tshp_, tshp_.make_scalar(1));
  831. auto xx_ = opr::Elemwise::make({2 * x_}, opr::Elemwise::Param::Mode::RELU);
  832. auto x1_ = opr::reduce_sum(xx_, tshp_);
  833. auto x2_ = opr::reduce_sum_sqr(xx_, tshp_);
  834. auto var_ = (x2_ - x1_ * x1_ / reduce_size_) / (reduce_size_ - 1),
  835. regular_var_ = var_ + (float)(1e-5);
  836. auto invsqrt_var_ = opr::Elemwise::make({regular_var_, pow_},
  837. opr::Elemwise::Param::Mode::POW);
  838. auto ovar_ = (x_ - x1_ / reduce_size_) * invsqrt_var_;
  839. HostTensorND h_ovar_;
  840. OutputSpec out_spec_;
  841. out_spec_.push_back(make_callback_copy(ovar_, h_ovar_));
  842. HostTensorND h_grad_;
  843. if (do_grad) {
  844. auto reduce_ovar = opr::reduce_sum(ovar_ * ovar_, ovar_.make_scalar(1));
  845. auto grad = cg::grad(reduce_ovar, x_);
  846. out_spec_.push_back(make_callback_copy(grad, h_grad_));
  847. }
  848. auto func2 = graph2->compile(out_spec_);
  849. func2->execute();
  850. MGB_ASSERT_TENSOR_NEAR(h_ovar_, h_ovar, 3e-5);
  851. if (do_grad){
  852. MGB_ASSERT_TENSOR_NEAR(h_grad_, h_grad, 3e-4);
  853. }
  854. }
  855. TEST(TestJITFusionHalide, ReduceShapeManip) {
  856. REQUIRE_GPU(1);
  857. set_backend(Backend::HALIDE);
  858. auto cn = CompNode::load("gpu0");
  859. HostTensorGenerator<> gen;
  860. auto do_chk = [&](bool dyn_shape) {
  861. auto host_x = gen({7, 8, 9}, cn);
  862. // TODO: handle opr fusion without shape constraints, and test dynamic
  863. // shape case where target shape can be inferred
  864. auto make_dst = [&host_x, dyn_shape](ComputingGraph& cg) {
  865. auto x = opr::Host2DeviceCopy::make(cg, host_x), xm2 = x * 2,
  866. one = x.make_scalar(1),
  867. tshp = opr::Concat::make(
  868. {one,
  869. opr::GetVarShape::make(
  870. dyn_shape ? opr::MarkDynamicVar::make(xm2)
  871. : xm2,
  872. 1),
  873. one},
  874. 0),
  875. y = opr::reduce_sum(xm2, tshp) + 3;
  876. return y;
  877. };
  878. HostTensorND host_y0, host_y1;
  879. auto funcs = make_func_pair(host_y0, host_y1, make_dst, 2);
  880. auto run = [&]() {
  881. funcs.first->execute();
  882. funcs.second->execute();
  883. MGB_ASSERT_TENSOR_NEAR(host_y0, host_y1, 1e-5);
  884. };
  885. funcs.second->to_json()->writeto_fpath(output_file(ssprintf(
  886. "TestJITFusionHalide.ReduceShapeManip%d.json", dyn_shape)));
  887. run();
  888. host_x->copy_from(*gen({13, 4, 5}, cn));
  889. run();
  890. if (!dyn_shape) {
  891. JITExecutor* jit;
  892. unpack_vector(find_oprs<JITExecutor>(*funcs.second), jit);
  893. ASSERT_TRUE(jit->input(0)
  894. ->owner_opr()
  895. ->same_type<opr::Host2DeviceCopy>());
  896. ASSERT_EQ(2u, jit->input().size());
  897. auto dep_type = jit->node_prop().dep_map().at(jit->input(1));
  898. ASSERT_EQ(cg::OperatorNodeBase::NodeProp::DepType::HOST_VALUE,
  899. dep_type);
  900. ASSERT_EQ(0u, find_oprs<opr::Elemwise>(*funcs.second).size());
  901. }
  902. };
  903. do_chk(false);
  904. do_chk(true);
  905. }
  906. TEST(TestJITFusionHalide, ReduceExp) {
  907. REQUIRE_GPU(1);
  908. set_backend(Backend::HALIDE);
  909. FusionChecker checker{
  910. 2,
  911. [](const SymbolVarArray& inp) -> SymbolVar {
  912. auto var1 =
  913. opr::reduce_sum(inp[0], opr::GetVarShape::make(inp[1]));
  914. auto var2 = opr::reduce_sum_sqr(inp[0] + inp[1],
  915. opr::GetVarShape::make(inp[1]));
  916. return var1 + var2;
  917. },
  918. CompNode::load("gpu0")};
  919. checker.run({TensorShape{3, 3}, {3, 1}});
  920. checker.run({TensorShape{3, 3}, {1}}); // to scalar
  921. }
  922. TEST(TestJITFusionHalide, ReduceO16xC32) {
  923. REQUIRE_GPU(1);
  924. set_backend(Backend::HALIDE);
  925. using DataType = opr::Reduce::Param::DataType;
  926. FusionChecker checker{
  927. 2,
  928. [](const SymbolVarArray& inp) -> SymbolVar {
  929. auto var1 = opr::Reduce::make(
  930. inp[0],
  931. {opr::Reduce::Mode::SUM, 1, DataType::FLOAT_O16xC32},
  932. {});
  933. auto var2 = opr::Reduce::make(inp[0],
  934. {opr::Reduce::Mode::SUM_SQR, 1,
  935. DataType::FLOAT_O16xC32},
  936. {});
  937. return var1 + var2;
  938. },
  939. CompNode::load("gpu0")};
  940. checker.disable_inp_grad().run({TensorShape{3, 3}, {3, 1}});
  941. }
  942. TEST(TestJITFusionHalide, ReduceSum) {
  943. REQUIRE_GPU(1);
  944. set_backend(Backend::HALIDE);
  945. FusionChecker checker{2,
  946. [](const SymbolVarArray& inp) -> SymbolVar {
  947. auto var1 = opr::reduce_sum(
  948. inp[0], opr::GetVarShape::make(inp[1]));
  949. return var1 + inp[1];
  950. },
  951. CompNode::load("gpu0")};
  952. checker.run({TensorShape{3, 3}, {3, 1}});
  953. checker.run({TensorShape{3, 3}, {1}}); // test reduce to scalar
  954. }
  955. TEST(TestJITFusionHalide, ReduceSumSqr) {
  956. REQUIRE_GPU(1);
  957. set_backend(Backend::HALIDE);
  958. FusionChecker checker{2,
  959. [](const SymbolVarArray& inp) -> SymbolVar {
  960. auto var1 = opr::reduce_sum_sqr(
  961. inp[0], opr::GetVarShape::make(inp[1]));
  962. return var1 + inp[1];
  963. },
  964. CompNode::load("gpu0")};
  965. checker.run({TensorShape{3, 3}, {3, 1}});
  966. checker.run({TensorShape{3, 3}, {3, 3}}); // test side effect
  967. }
  968. TEST(TestJITFusionHalide, ReduceMax) {
  969. REQUIRE_GPU(1);
  970. set_backend(Backend::HALIDE);
  971. FusionChecker checker{2,
  972. [](const SymbolVarArray& inp) -> SymbolVar {
  973. auto var1 = opr::reduce_max(
  974. inp[0], opr::GetVarShape::make(inp[1]));
  975. return var1 + inp[1];
  976. },
  977. CompNode::load("gpu0")};
  978. checker.run({TensorShape{3, 3}, {3, 1}});
  979. }
  980. TEST(TestJITFusionHalide, ReduceMin) {
  981. REQUIRE_GPU(1);
  982. set_backend(Backend::HALIDE);
  983. FusionChecker checker{2,
  984. [](const SymbolVarArray& inp) -> SymbolVar {
  985. auto var1 = opr::reduce_min(
  986. inp[0], opr::GetVarShape::make(inp[1]));
  987. return var1 + inp[1];
  988. },
  989. CompNode::load("gpu0")};
  990. checker.run({TensorShape{3, 3}, {3, 1}});
  991. }
  992. TEST(TestJITFusionHalide, ReduceProduct) {
  993. REQUIRE_GPU(1);
  994. set_backend(Backend::HALIDE);
  995. FusionChecker checker{2,
  996. [](const SymbolVarArray& inp) -> SymbolVar {
  997. auto var1 = opr::reduce_prod(
  998. inp[0], opr::GetVarShape::make(inp[1]));
  999. return var1 + inp[1];
  1000. },
  1001. CompNode::load("gpu0")};
  1002. checker.run({TensorShape{3, 3}, {3, 1}});
  1003. }
  1004. TEST(TestJITFusionHalide, ReduceMean) {
  1005. REQUIRE_GPU(1);
  1006. set_backend(Backend::HALIDE);
  1007. FusionChecker checker{2,
  1008. [](const SymbolVarArray& inp) -> SymbolVar {
  1009. auto var1 = opr::Reduce::make(
  1010. inp[0], opr::Reduce::Param::Mode::MEAN,
  1011. opr::GetVarShape::make(inp[1]));
  1012. return var1 + inp[1];
  1013. },
  1014. CompNode::load("gpu0")};
  1015. checker.run({TensorShape{3, 3}, {3, 1}});
  1016. }
  1017. TEST(TestJITFusionHalide, SameGradOpr) {
  1018. REQUIRE_GPU(1);
  1019. set_backend(Backend::HALIDE);
  1020. auto cn = CompNode::load("gpu0");
  1021. auto graph = ComputingGraph::make();
  1022. HostTensorGenerator<> gen;
  1023. auto host_x0 = gen({3, 3}, cn), host_x1 = gen({3, 1}, cn),
  1024. host_x2 = gen({3, 3}, cn);
  1025. auto a = opr::Host2DeviceCopy::make(*graph, host_x0),
  1026. b = opr::Host2DeviceCopy::make(*graph, host_x1),
  1027. c = opr::Host2DeviceCopy::make(*graph, host_x2);
  1028. auto y = (a + b) * c;
  1029. auto reduce_y = opr::reduce_sum(y * y, y.make_scalar(1));
  1030. auto a_grad = opr::VirtualGrad::make(reduce_y.node(), a.node());
  1031. auto b_grad = opr::VirtualGrad::make(reduce_y.node(), b.node());
  1032. auto c_grad = opr::VirtualGrad::make(reduce_y.node(), c.node());
  1033. gopt::GraphOptimizer gopt;
  1034. gopt.add_pass<gopt::JITFusionPass>(true);
  1035. gopt.add_pass<gopt::ExpandVirtualGradPass>();
  1036. VarNodeArray vars{y.node(), a_grad.node(), b_grad.node(), c_grad.node()};
  1037. gopt.apply_inplace(vars);
  1038. ASSERT_EQ(vars[1]->owner_opr()->input(0), vars[2]->owner_opr()->input(0));
  1039. ASSERT_NE(vars[1]->owner_opr()->input(0), vars[3]->owner_opr()->input(0));
  1040. }
  1041. template <typename tag>
  1042. class TestJITHalideFusionCuda : public ::testing::Test {};
  1043. TYPED_TEST_CASE(TestJITHalideFusionCuda, test_types);
  1044. TYPED_TEST(TestJITHalideFusionCuda, run) {
  1045. set_backend(Backend::NONE);
  1046. REQUIRE_GPU(1);
  1047. run<TypeParam>(Backend::HALIDE, CompNode::load("gpu0"));
  1048. set_backend(Backend::NONE);
  1049. }
  1050. #endif // MGB_JIT_HALIDE
  1051. template <typename tag>
  1052. class TestJITNvrtcFusion : public ::testing::Test {};
  1053. TYPED_TEST_CASE(TestJITNvrtcFusion, test_types);
  1054. TYPED_TEST(TestJITNvrtcFusion, run) {
  1055. set_backend(Backend::NONE);
  1056. REQUIRE_GPU(1);
  1057. run<TypeParam>(Backend::NVRTC, CompNode::load("gpu0"));
  1058. set_backend(Backend::NONE);
  1059. }
  1060. TEST(TestJITNvrtcFusion, SourceCache) {
  1061. REQUIRE_GPU(1);
  1062. set_backend(Backend::NVRTC);
  1063. std::string cache_cat;
  1064. std::vector<std::string> sources;
  1065. auto on_cache_get = [&](const std::string& category, const void* key,
  1066. size_t key_size, const void*, size_t) {
  1067. if (cache_cat.empty()) {
  1068. cache_cat = category;
  1069. } else {
  1070. ASSERT_EQ(cache_cat, category);
  1071. }
  1072. sources.push_back(std::string{static_cast<const char*>(key), key_size});
  1073. };
  1074. PersistentCacheHook cache_hook{on_cache_get};
  1075. auto cn = CompNode::load("gpu0");
  1076. auto run = [cn]() {
  1077. HostTensorGenerator<> gen;
  1078. auto host_x = gen({2, 3}, cn);
  1079. auto make_dst = [&](ComputingGraph& graph) {
  1080. auto x = opr::Host2DeviceCopy::make(graph, host_x),
  1081. y = jit_stop(x * opr::sin(x)), z = y + opr::tanh(y);
  1082. return z;
  1083. };
  1084. HostTensorND host_y1, host_y2;
  1085. auto funcs = make_func_pair(host_y1, host_y2, make_dst, 2);
  1086. ASSERT_EQ(2u, find_oprs<JITExecutor>(*funcs.second).size());
  1087. funcs.first->execute();
  1088. funcs.second->execute();
  1089. MGB_ASSERT_TENSOR_EQ(host_y1, host_y2);
  1090. };
  1091. for (size_t i = 0; i < 4; ++i) {
  1092. run();
  1093. ASSERT_EQ((i + 1) * 2, sources.size());
  1094. ASSERT_EQ(sources[0], sources[i * 2]);
  1095. ASSERT_EQ(sources[1], sources[i * 2 + 1]);
  1096. }
  1097. }
  1098. TEST(TestJITNvrtc, DimshuffleFusion) {
  1099. REQUIRE_GPU(1);
  1100. set_backend(Backend::NVRTC);
  1101. auto cn = CompNode::load("gpu0");
  1102. HostTensorGenerator<> gen;
  1103. // single dimshuffle
  1104. {
  1105. auto host_x = gen({2, 3, 8, 8}, cn);
  1106. auto host_w = gen({3, 3, 1, 1}, cn);
  1107. auto make_dst = [&](ComputingGraph& graph) {
  1108. auto data = opr::SharedDeviceTensor::make(graph, *host_x);
  1109. auto w = opr::SharedDeviceTensor::make(graph, *host_w);
  1110. opr::Convolution::Param param;
  1111. auto x = opr::Convolution::make(data, w, param);
  1112. x = opr::relu(x);
  1113. x = opr::Dimshuffle::make(x, {1, 2, 3, 0});
  1114. x = opr::TypeCvt::make(x, dtype::Float16{});
  1115. return x;
  1116. };
  1117. HostTensorND host_y1, host_y2;
  1118. auto funcs = make_func_pair(host_y1, host_y2, make_dst, 1);
  1119. ASSERT_EQ(1u, find_oprs<JITExecutor>(*funcs.second).size());
  1120. ASSERT_EQ(1u, find_oprs<opr::Convolution>(*funcs.second).size());
  1121. ASSERT_EQ(0u, find_oprs<opr::Elemwise>(*funcs.second).size());
  1122. ASSERT_EQ(0u, find_oprs<opr::Dimshuffle>(*funcs.second).size());
  1123. ASSERT_EQ(0u, find_oprs<opr::TypeCvt>(*funcs.second).size());
  1124. funcs.first->execute();
  1125. funcs.second->execute();
  1126. MGB_ASSERT_TENSOR_EQ(host_y1, host_y2);
  1127. }
  1128. // multiple dimshuffles in one branch
  1129. {
  1130. auto host_x = gen({3, 4, 6}, cn);
  1131. auto make_dst = [&](ComputingGraph& graph) {
  1132. auto data = opr::SharedDeviceTensor::make(graph, *host_x);
  1133. auto x = opr::relu(data);
  1134. x = opr::Dimshuffle::make(x, {2, 0, 1});
  1135. x = opr::sigmoid(x);
  1136. x = opr::Dimshuffle::make(x, {1, 0, -1, 2});
  1137. x = opr::TypeCvt::make(x, dtype::Float16{});
  1138. return x;
  1139. };
  1140. HostTensorND host_y1, host_y2;
  1141. auto funcs = make_func_pair(host_y1, host_y2, make_dst, 1);
  1142. ASSERT_EQ(1u, find_oprs<JITExecutor>(*funcs.second).size());
  1143. ASSERT_EQ(0u, find_oprs<opr::Elemwise>(*funcs.second).size());
  1144. ASSERT_EQ(0u, find_oprs<opr::Dimshuffle>(*funcs.second).size());
  1145. ASSERT_EQ(0u, find_oprs<opr::TypeCvt>(*funcs.second).size());
  1146. funcs.first->execute();
  1147. funcs.second->execute();
  1148. MGB_ASSERT_TENSOR_EQ(host_y1, host_y2);
  1149. }
  1150. // multiple dimshuffles in two branches
  1151. {
  1152. auto host_x = gen({3, 4, 6}, cn);
  1153. auto make_dst = [&](ComputingGraph& graph) {
  1154. auto data = opr::SharedDeviceTensor::make(graph, *host_x);
  1155. auto x = opr::relu(data);
  1156. x = opr::Dimshuffle::make(x, {2, 0, 1});
  1157. x = opr::sigmoid(x);
  1158. x = opr::Dimshuffle::make(x, {1, 0, -1, 2});
  1159. x = opr::TypeCvt::make(x, dtype::Float16{});
  1160. auto y = opr::sigmoid(data);
  1161. y = opr::Dimshuffle::make(y, {0, 2, -1, 1});
  1162. y = opr::TypeCvt::make(y, dtype::Float16{});
  1163. auto z = x + y;
  1164. return z;
  1165. };
  1166. HostTensorND host_y1, host_y2;
  1167. auto funcs = make_func_pair(host_y1, host_y2, make_dst, 1);
  1168. ASSERT_EQ(1u, find_oprs<JITExecutor>(*funcs.second).size());
  1169. ASSERT_EQ(0u, find_oprs<opr::Elemwise>(*funcs.second).size());
  1170. ASSERT_EQ(0u, find_oprs<opr::Dimshuffle>(*funcs.second).size());
  1171. ASSERT_EQ(0u, find_oprs<opr::TypeCvt>(*funcs.second).size());
  1172. funcs.first->execute();
  1173. funcs.second->execute();
  1174. MGB_ASSERT_TENSOR_NEAR(host_y1, host_y2, 1e-3);
  1175. }
  1176. // dimshuffle pattern length > 4
  1177. {
  1178. auto host_x = gen({4, 3, 4, 6}, cn);
  1179. auto make_dst = [&](ComputingGraph& graph) {
  1180. auto data = opr::SharedDeviceTensor::make(graph, *host_x);
  1181. auto x = opr::relu(data);
  1182. x = opr::Dimshuffle::make(x, {2, 1, 0, -1, 3});
  1183. x = opr::TypeCvt::make(x, dtype::Float16{});
  1184. return x;
  1185. };
  1186. HostTensorND host_y1, host_y2;
  1187. auto g0 = ComputingGraph::make();
  1188. g0->options().graph_opt_level = 0;
  1189. auto f0 = g0->compile({make_callback_copy(make_dst(*g0), host_y1)});
  1190. auto g1 = ComputingGraph::make();
  1191. g1->options().graph_opt_level = 3;
  1192. g1->options().graph_opt.jit = 1;
  1193. auto f1 = g1->compile({make_callback_copy(make_dst(*g1), host_y2)});
  1194. EXPECT_TRUE(find_oprs<JITExecutor>(*f1).empty());
  1195. f0->execute();
  1196. f1->execute();
  1197. MGB_ASSERT_TENSOR_EQ(host_y1, host_y2);
  1198. }
  1199. // dimshuffle is endpoint
  1200. {
  1201. auto host_x = gen({4, 3, 4, 6}, cn);
  1202. auto make_dst = [&](ComputingGraph& graph) {
  1203. auto x = opr::TypeCvt::make(
  1204. opr::Host2DeviceCopy::make(graph, host_x),
  1205. dtype::Float16{});
  1206. auto y = opr::Dimshuffle::make(x, {3, 0, 1, 2});
  1207. return y;
  1208. };
  1209. HostTensorND host_y;
  1210. auto g1 = ComputingGraph::make();
  1211. g1->options().graph_opt_level = 3;
  1212. g1->options().graph_opt.jit = 1;
  1213. auto f1 = g1->compile({make_callback_copy(make_dst(*g1), host_y)});
  1214. EXPECT_TRUE(find_oprs<JITExecutor>(*f1).empty());
  1215. }
  1216. }
  1217. TEST(TestJITNvrtc, DimshuffleGrad) {
  1218. REQUIRE_GPU(1);
  1219. set_backend(Backend::NVRTC);
  1220. auto cn = CompNode::load("gpu0");
  1221. HostTensorGenerator<> gen;
  1222. // single dimshuffle
  1223. {
  1224. auto host_x = gen({2, 3, 8, 8}, cn);
  1225. auto host_w = gen({3, 3, 1, 1}, cn);
  1226. auto make_dst = [&](ComputingGraph& graph) {
  1227. auto data = opr::SharedDeviceTensor::make(graph, *host_x);
  1228. auto w = opr::SharedDeviceTensor::make(graph, *host_w);
  1229. opr::Convolution::Param param;
  1230. auto x = opr::Convolution::make(data, w, param);
  1231. x = opr::relu(x);
  1232. x = opr::Dimshuffle::make(x, {1, 2, 3, 0});
  1233. x = opr::TypeCvt::make(x, dtype::Float16{});
  1234. auto loss = opr::reduce_sum(x, x.make_scalar(1));
  1235. auto grad = cg::grad(loss, w);
  1236. return grad;
  1237. };
  1238. HostTensorND host_y1, host_y2;
  1239. auto funcs = make_func_pair(host_y1, host_y2, make_dst, 1);
  1240. ASSERT_EQ(1u, find_oprs<JITExecutor>(*funcs.second).size());
  1241. ASSERT_EQ(1u, find_oprs<opr::Convolution>(*funcs.second).size());
  1242. ASSERT_EQ(0u, find_oprs<opr::Elemwise>(*funcs.second).size());
  1243. ASSERT_EQ(0u, find_oprs<opr::Dimshuffle>(*funcs.second).size());
  1244. ASSERT_EQ(0u, find_oprs<opr::TypeCvt>(*funcs.second).size());
  1245. funcs.first->execute();
  1246. funcs.second->execute();
  1247. MGB_ASSERT_TENSOR_EQ(host_y1, host_y2);
  1248. }
  1249. // multiple dimshuffles in two branches
  1250. {
  1251. auto host_x = gen({3, 4, 6}, cn);
  1252. auto make_dst = [&](ComputingGraph& graph) {
  1253. auto data = opr::SharedDeviceTensor::make(graph, *host_x);
  1254. auto x = opr::relu(data);
  1255. x = opr::Dimshuffle::make(x, {2, 0, 1});
  1256. x = opr::sigmoid(x);
  1257. x = opr::Dimshuffle::make(x, {1, 0, -1, 2});
  1258. x = opr::TypeCvt::make(x, dtype::Float16{});
  1259. auto y = opr::sigmoid(data);
  1260. y = opr::Dimshuffle::make(y, {0, 2, -1, 1});
  1261. y = opr::TypeCvt::make(y, dtype::Float16{});
  1262. auto z = x + y;
  1263. auto loss = opr::reduce_sum(z, z.make_scalar(1));
  1264. auto grad = cg::grad(loss, data);
  1265. return grad;
  1266. };
  1267. HostTensorND host_y1, host_y2;
  1268. auto funcs = make_func_pair(host_y1, host_y2, make_dst, 1);
  1269. ASSERT_EQ(1u, find_oprs<JITExecutor>(*funcs.second).size());
  1270. ASSERT_EQ(0u, find_oprs<opr::Elemwise>(*funcs.second).size());
  1271. ASSERT_EQ(0u, find_oprs<opr::Dimshuffle>(*funcs.second).size());
  1272. ASSERT_EQ(0u, find_oprs<opr::TypeCvt>(*funcs.second).size());
  1273. funcs.first->execute();
  1274. funcs.second->execute();
  1275. MGB_ASSERT_TENSOR_NEAR(host_y1, host_y2, 1e-3);
  1276. }
  1277. {
  1278. FusionChecker checker{2,
  1279. [](const SymbolVarArray& inp) -> SymbolVar {
  1280. auto var = opr::Dimshuffle::make(inp[0], {1, 2, 3, 0});
  1281. return inp[1] * var;
  1282. },
  1283. CompNode::load("gpu0")};
  1284. checker.set_jit_level(1)
  1285. .run({TensorShape{1, 2, 3, 4}, {2, 3, 4, 1}});
  1286. }
  1287. }
  1288. #endif // MGB_JIT
  1289. // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

The MegEngine installation package bundles the CUDA environment needed to run code on the GPU, so there is no separate CPU or GPU build to choose from. To run GPU programs, make sure the machine has a GPU and that the driver is properly installed. If you would like to try deep-learning development on a cloud GPU platform, you are welcome to visit the MegStudio platform.
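As a quick sanity check that the GPU and driver are visible, a minimal sketch like the one below can be built against the same MegBrain C++ API used by the tests above (`CompNode::get_device_count` and `CompNode::load` both appear in fusion.cpp). It is not part of the test file; the header path and the assumption that `CompNode::DeviceType::CUDA` and `CompNode::to_string()` are available follow the MegBrain sources but should be verified against your checkout.

```cpp
// Standalone sketch (not part of fusion.cpp): skip GPU-only work when no
// CUDA device is present, mirroring how the tests guard with REQUIRE_GPU.
#include "megbrain/comp_node.h"
#include <cstdio>

int main() {
    using mgb::CompNode;
    // Count CUDA devices; the tests use the same call with DeviceType::CPU.
    if (CompNode::get_device_count(CompNode::DeviceType::CUDA) == 0) {
        std::printf("no CUDA device found; GPU tests would be skipped\n");
        return 0;
    }
    // Load the first GPU comp node, as the Halide/NVRTC tests do.
    auto cn = CompNode::load("gpu0");
    std::printf("loaded comp node: %s\n", cn.to_string().c_str());
    return 0;
}
```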