
others.cpp 28 kB

/**
 * \file src/opr/test/basic_arith/others.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#include "megbrain/test/helper.h"
#include "megbrain/test/autocheck.h"
#include "megbrain/test/host_static_calc.h"
#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/blas.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/utility.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/utils/timer.h"
#include "megdnn/tensor_iter.h"

#include <cmath>

using namespace mgb;

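// AddUpdate applies dest = dest * alpha + delta * beta + bias in place on the
// shared device tensor; the result is checked against a host-side computation.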
TEST(TestOprBasicArith, AddUpdate) {
    constexpr size_t SIZE = 123456;
    opr::AddUpdate::Param param{2, -1, 0.5f};
    HostTensorGenerator<> gen;
    auto host_x = gen({SIZE}), host_y = gen({SIZE});
    auto dev_x = std::make_shared<DeviceTensorND>(CompNode::load("xpu0"));
    dev_x->copy_from(*host_x);

    auto graph = ComputingGraph::make();
    SymbolVar dev_x_shared = opr::SharedDeviceTensor::make(
            *graph, dev_x, {"x"}),
              dev_y = opr::Host2DeviceCopy::make(*graph, host_y, {"y"}),
              dev_x_updated = opr::AddUpdate::make(dev_x_shared, dev_y, param);

    auto func = graph->compile({{
            dev_x_updated, [&](DeviceTensorND &){}}});
    func->execute();
    ASSERT_EQ(dev_x->raw_ptr(), dev_x_updated.node()->prev_dev_ptr());

    func->to_json()->writeto_fpath(output_file("add_update_graph.json"));

    HostTensorND get{CompNode::load("xpu0")};
    get.copy_from(*dev_x).sync();
    ASSERT_TRUE(get.layout().eq_layout(host_x->layout()));

    auto x = host_x->ptr<float>(), y = host_y->ptr<float>(),
         z = get.ptr<float>();
    for (size_t i = 0; i < SIZE; i ++) {
        auto expect = x[i] * param.alpha->get_cast<float>() +
                      y[i] * param.beta->get_cast<float>() +
                      param.bias->get_cast<float>();
        MGB_ASSERT_FLOAT_EQ(expect, z[i]);
    }
}

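// Integer variant of AddUpdate; after a successful run, setting a fractional
// bias must make the next execution fail with MegDNNError.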
TEST(TestOprBasicArith, AddUpdateInt) {
    constexpr size_t SIZE = 123;
    opr::AddUpdate::Param param{2, -1, 3};
    HostTensorGenerator<dtype::Int32> gen;
    auto host_x = gen({SIZE}), host_y = gen({SIZE});
    auto dev_x = std::make_shared<DeviceTensorND>(CompNode::load("xpu0"));
    dev_x->copy_from(*host_x);

    auto graph = ComputingGraph::make();
    SymbolVar dev_x_shared = opr::SharedDeviceTensor::make(
            *graph, dev_x, {"x"}),
              dev_y = opr::Host2DeviceCopy::make(*graph, host_y, {"y"}),
              dev_x_updated = opr::AddUpdate::make(dev_x_shared, dev_y, param);

    auto func = graph->compile({{
            dev_x_updated, [&](DeviceTensorND &){}}});
    func->execute();
    ASSERT_EQ(dev_x->raw_ptr(), dev_x_updated.node()->prev_dev_ptr());

    HostTensorND get{CompNode::load("xpu0")};
    get.copy_from(*dev_x).sync();
    ASSERT_TRUE(get.layout().eq_layout(host_x->layout()));

    auto x = host_x->ptr<int>(), y = host_y->ptr<int>(),
         z = get.ptr<int>();
    for (size_t i = 0; i < SIZE; i ++) {
        auto expect = x[i] * param.alpha->get_cast<int>() +
                      y[i] * param.beta->get_cast<int>() +
                      param.bias->get_cast<int>();
        ASSERT_EQ(expect, z[i]) << ssprintf("i=%zu x=%d y=%d", i, x[i], y[i]);
    }

    ASSERT_NO_THROW(func->execute());
    param.bias->set(2.3f);
    ASSERT_THROW(func->execute(), MegDNNError);
}

TEST(TestOprBasicArith, DynAddUpdate) {
    constexpr size_t SIZE = 10;
    HostTensorGenerator<> gen;
    auto host_x = gen({SIZE}), host_y = gen({SIZE});
    auto dev_x = std::make_shared<DeviceTensorND>(CompNode::load("xpu0"));
    dev_x->copy_from(*host_x);

    auto graph = ComputingGraph::make();
    auto x = opr::SharedDeviceTensor::make(*graph, dev_x, {"x"}),
         y = opr::MarkDynamicVar::make(opr::Host2DeviceCopy::make(*graph,
                     host_y, {"y"})),
         x_updated = opr::AddUpdate::make(x, y, {});

    ASSERT_FALSE(cg::is_static_var_shape(y.node()));
    ASSERT_TRUE(cg::is_static_var_shape(x_updated.node()));

    auto func = graph->compile({{x_updated, [&](DeviceTensorND &){}}});
    func->execute();

    HostTensorND host_xu;
    host_xu.copy_from(*dev_x).sync();
    ASSERT_TRUE(host_xu.layout().eq_layout(host_x->layout()));

    {
        auto x = host_x->ptr<float>(), y = host_y->ptr<float>(),
             z = host_xu.ptr<float>();
        for (size_t i = 0; i < SIZE; i ++) {
            MGB_ASSERT_FLOAT_EQ(x[i] + y[i], z[i]);
        }
    }
}

TEST(TestOprBasicArith, AddUpdateBroadcast) {
    constexpr size_t SIZE = 123456;
    opr::AddUpdate::Param param{-1.2f, 2.1f, -4};
    HostTensorGenerator<> gen;
    auto host_x = gen({SIZE});
    auto dev_x = std::make_shared<DeviceTensorND>(CompNode::load("xpu0"));
    dev_x->copy_from(*host_x);

    auto graph = ComputingGraph::make();
    SymbolVar x = opr::SharedDeviceTensor::make(*graph, dev_x, {"x"}),
              delta = opr::Subtensor::make(x,
                      {opr::Subtensor::AxisIndexer::make_index(0,
                          x.make_scalar(3))}),
              x_updated = opr::AddUpdate::make(x, delta, param);

    auto func = graph->compile({{x_updated, {}}});
    func->execute();

    HostTensorND get{CompNode::load("xpu0")};
    get.copy_from(*dev_x).sync();
    ASSERT_TRUE(get.layout().eq_layout(host_x->layout()));

    auto xp = host_x->ptr<float>(), z = get.ptr<float>();
    for (size_t i = 0; i < SIZE; ++ i) {
        auto expect = xp[i] * param.alpha->get_cast<float>() +
                      xp[3] * param.beta->get_cast<float>() +
                      param.bias->get_cast<float>();
        MGB_ASSERT_FLOAT_EQ(expect, z[i]);
    }
}

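// With alpha = 0 the destination is simply overwritten, so a NaN already stored
// in it must not leak into the result.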
TEST(TestOprBasicArith, AddUpdateNan) {
    constexpr size_t SIZE = 23;
    HostTensorGenerator<> gen;
    auto host_x = gen({SIZE}),
         host_src = gen({1});
    host_x->ptr<float>()[0] = NAN;
    auto dev_x = std::make_shared<DeviceTensorND>(CompNode::load("xpu0"));
    dev_x->copy_from(*host_x);

    auto graph = ComputingGraph::make();
    SymbolVar x = opr::SharedDeviceTensor::make(*graph, dev_x, {"x"}),
              dest = opr::Host2DeviceCopy::make(*graph, host_src),
              xu = opr::AddUpdate::make(x, dest, {0.f, 1});

    auto func = graph->compile({{xu, {}}});
    func->execute();

    HostTensorND host_y;
    host_y.copy_from(*dev_x).sync();
    for (size_t i = 0; i < SIZE; ++ i)
        MGB_ASSERT_FLOAT_EQ(host_src->ptr<float>()[0], host_y.ptr<float>()[i]);
}

TEST(TestOprBasicArith, AddInplace) {
    constexpr size_t SIZE = 102400;
    HostTensorGenerator<> gen;
    auto host_opr0 = gen({SIZE}), host_opr1 = gen({SIZE}),
         host_opr2 = gen({SIZE});

    // for operations with commutable input, must check both input orders:
    // opr1 + opr0, opr1 + opr2
    auto graph = ComputingGraph::make();
    auto opr0 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_opr0, {"opr0"}),
         opr1 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_opr1, {"opr1"}),
         opr2 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_opr2, {"opr2"}),
         sum0 = opr::add(opr1, opr0).rename("sum0"),
         sum1 = opr::add(opr1, opr2).rename("sum1"),
         sum2 = opr::add(opr2, opr0).rename("sum2");

    // check dedup
    ASSERT_EQ(sum0.node(), (opr0 + opr1).node());

    HostTensorND host_sum0, host_sum1;
    auto func = graph->compile({make_callback_copy(sum0, host_sum0),
                                make_callback_copy(sum1, host_sum1)});
    func->execute();

    EXPECT_TRUE(dev_ptr(sum0) == dev_ptr(opr1) ||
                dev_ptr(sum0) == dev_ptr(opr0));
    EXPECT_TRUE(dev_ptr(sum1) == dev_ptr(opr1) ||
                dev_ptr(sum1) == dev_ptr(opr2));

    func->to_json()->writeto_fpath(output_file("TestAddInplaceFunc0.json"));

    ASSERT_TRUE(host_sum0.layout().eq_layout(host_opr0->layout()));
    ASSERT_TRUE(host_sum1.layout().eq_layout(host_opr0->layout()));

    auto o0 = host_opr0->ptr<float>(), o1 = host_opr1->ptr<float>(),
         o2 = host_opr2->ptr<float>(),
         s0 = host_sum0.sync().ptr<float>(), s1 = host_sum1.sync().ptr<float>();
    for (size_t i = 0; i < SIZE; i ++) {
        MGB_ASSERT_FLOAT_EQ(o1[i] + o0[i], s0[i]) <<
            ssprintf("failed opr1(%.5f)+opr0(%.5f) at %zd", o1[i], o0[i], i);
        MGB_ASSERT_FLOAT_EQ(o1[i] + o2[i], s1[i]) <<
            ssprintf("failed opr1(%.5f)+opr2(%.5f) at %zd", o1[i], o2[i], i);
    }

    *host_opr0 = *gen({SIZE});
    *host_opr1 = *gen({SIZE});
    *host_opr2 = *gen({SIZE});

    HostTensorND host_sum2;
    func = graph->compile({make_callback_copy(sum0, host_sum0),
                           make_callback_copy(sum1, host_sum1),
                           make_callback_copy(sum2, host_sum2)});
    func->execute();
    func->to_json()->writeto_fpath(output_file("TestAddInplaceFunc1.json"));

    ASSERT_TRUE(host_sum0.layout().eq_layout(host_opr0->layout()));
    ASSERT_TRUE(host_sum1.layout().eq_layout(host_opr0->layout()));
    ASSERT_TRUE(host_sum2.layout().eq_layout(host_opr0->layout()));

    o0 = host_opr0->ptr<float>(); o1 = host_opr1->ptr<float>();
    o2 = host_opr2->ptr<float>();
    s0 = host_sum0.ptr<float>(); s1 = host_sum1.ptr<float>();
    auto s2 = host_sum2.sync().ptr<float>();

    for (size_t i = 0; i < SIZE; i ++) {
        MGB_ASSERT_FLOAT_EQ(o1[i] + o0[i], s0[i]) <<
            ssprintf("failed opr1(%.5f)+opr0(%.5f) at %zd", o1[i], o0[i], i);
        MGB_ASSERT_FLOAT_EQ(o1[i] + o2[i], s1[i]) <<
            ssprintf("failed opr1(%.5f)+opr2(%.5f) at %zd", o1[i], o2[i], i);
        MGB_ASSERT_FLOAT_EQ(o2[i] + o0[i], s2[i]) <<
            ssprintf("failed opr2(%.5f)+opr0(%.5f) at %zd", o2[i], o0[i], i);
    }
}

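// AddUpdate is placed on a second stream of the same GPU; the waiter callback
// spins until the callback after AddUpdate fires, checking that execution
// across streams still makes progress.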
TEST(TestOprBasicArith, AddUpdateOtherStream) {
    REQUIRE_GPU(1);
    constexpr size_t SIZE = 60;
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();

    std::atomic_bool flag{false};
    auto set_flag = [&flag](DeviceTensorND&) {
        flag = true;
    };
    auto wait_flag = [&flag](DeviceTensorND&) {
        while (!flag) {
            using namespace std::literals;
            std::this_thread::sleep_for(0.2s);
        }
    };

    std::shared_ptr<HostTensorND> host_val = gen({SIZE});
    auto cn1 = CompNode::load("gpu0:0").change_stream(1);

    auto param = opr::SharedDeviceTensor::make(*graph, *host_val);
    param.node()->owner_opr()->node_prop().attribute().priority =
            std::numeric_limits<int>::max();
    auto copy = opr::Copy::make(param, cn1);
    auto add = (copy + 3) * 5;
    auto add_update = opr::AddUpdate::make(param, add, {}, {cn1});
    auto callback = opr::CallbackInjector::make(add_update, set_flag);
    auto waiter = opr::CallbackInjector::make(
            opr::SharedDeviceTensor::make(*graph, *host_val),
            wait_flag);

    HostTensorND host_out0;
    HostTensorND host_out1;
    auto func = graph->compile({make_callback_copy(callback, host_out0),
                                make_callback_copy(waiter, host_out1)});
    func->execute();
}

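// The last field of AddUpdate::Param disables the update, so the destination
// tensor must stay unchanged after execution.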
TEST(TestOprBasicArith, DisableAddUpdate) {
    constexpr size_t SIZE = 10;
    opr::AddUpdate::Param param{2, -1, 0.5f, 1};
    HostTensorGenerator<> gen;
    auto host_x = gen({SIZE}), host_y = gen({SIZE});
    auto dev_x = std::make_shared<DeviceTensorND>(CompNode::load("xpu0"));
    dev_x->copy_from(*host_x);

    auto graph = ComputingGraph::make();
    SymbolVar dev_x_shared = opr::SharedDeviceTensor::make(
            *graph, dev_x, {"x"}),
              dev_y = opr::Host2DeviceCopy::make(*graph, host_y, {"y"}),
              dev_x_updated = opr::AddUpdate::make(dev_x_shared, dev_y, param);

    auto func = graph->compile({{
            dev_x_updated, [&](DeviceTensorND &){}}});
    func->execute();
    ASSERT_EQ(dev_x->raw_ptr(), dev_x_updated.node()->prev_dev_ptr());

    func->to_json()->writeto_fpath(output_file("add_update_graph.json"));

    HostTensorND get{CompNode::load("xpu0")};
    get.copy_from(*dev_x).sync();
    ASSERT_TRUE(get.layout().eq_layout(host_x->layout()));

    auto x = host_x->ptr<float>(), y = get.ptr<float>();
    for (size_t i = 0; i < SIZE; i ++) {
        MGB_ASSERT_FLOAT_EQ(x[i], y[i]);
    }
}

TEST(TestOprBasicArith, AddUpdateVolatile) {
    constexpr int SIZE = 12222;
    opr::AddUpdate::Param param{2, -1, 0.5f};
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("xpu0");
    for (auto dynamic_alloc : {false, true}) {
        // test on both static and dynamic allocation
        auto host_x = gen({SIZE << 1}), host_y = gen({SIZE << 1});
        auto dev_x = std::make_shared<DeviceTensorND>(cn);
        DeviceTensorND dev_x0, dev_x1;
        HostTensorND host_sub;
        dev_x0.copy_from(*host_x).sync();
        dev_x1.copy_from(*host_x).sync();
        *dev_x = dev_x0;

        auto graph = ComputingGraph::make();
        graph->options().force_dynamic_alloc = dynamic_alloc;
        SymbolVar dev_x_shared = opr::VolatileSharedDeviceTensor::make(
                *graph, dev_x, {"x"}),
                  dev_y = opr::Host2DeviceCopy::make(*graph, host_y, {"y"}),
                  dev_x_updated = opr::AddUpdate::make(dev_x_shared, dev_y, param),
                  // check read-only forward on force updated var
                  dev_x_updated_sub = opr::Subtensor::make(dev_x_updated, {
                          opr::Subtensor::AxisIndexer::make_interval(-1, None, None,
                              dev_x_shared.make_scalar(SIZE >> 1))});

        auto func = graph->compile({
                {dev_x_updated, [&](DeviceTensorND &){}},
                {make_callback_copy(dev_x_updated_sub, host_sub)}});

        auto run = [&] {
            HostTensorND origin_x{cn}, get{cn};
            origin_x.copy_from(*dev_x).sync();
            func->execute().wait();
            ASSERT_EQ(dev_x->raw_ptr(), dev_x_updated.node()->prev_dev_ptr());
            ASSERT_EQ(dev_x->raw_ptr(), dev_x_updated_sub.node()->prev_dev_ptr());
            get.copy_from(*dev_x).sync();
            ASSERT_TRUE(get.layout().eq_layout(origin_x.layout()));
            mgb_assert(origin_x.layout().is_contiguous() &&
                       get.layout().is_contiguous() &&
                       host_y->layout().is_contiguous());

            auto x = origin_x.ptr<float>(), y = host_y->ptr<float>(),
                 z = get.ptr<float>();
            bool bcast = dev_x->shape().ndim > 1;
            auto expect = [&](size_t i) {
                return x[i] * param.alpha->get_cast<float>() +
                       (bcast ? y[i / SIZE] : y[i]) *
                               param.beta->get_cast<float>() +
                       param.bias->get_cast<float>();
            };
            for (size_t i = 0; i < SIZE * 2; i ++) {
                MGB_ASSERT_FLOAT_EQ(expect(i), z[i]);
            }

            mgb_assert(host_sub.shape().total_nr_elems() == 4 &&
                       host_sub.layout().is_contiguous());
            for (size_t i = 0; i < 4; ++ i) {
                size_t idx = i * (SIZE >> 1);
                MGB_ASSERT_FLOAT_EQ(expect(idx), host_sub.ptr<float>()[i]);
            }
        };

        run();
        run();
        *dev_x = dev_x1;    // ptr change
        run();

        host_x = gen({2, SIZE});
        host_y->copy_from(*gen({2, 1})).sync();
        dev_x->copy_from(*host_x).sync();   // shape change
        run();
    }
}

// AddUpdate in gradient path but no gradient flows through it
TEST(TestOprBasicArith, AddUpdateInGradPath) {
    auto graph = ComputingGraph::make();
    HostTensorGenerator<> gen;
    auto dest = opr::SharedDeviceTensor::make(*graph, *gen({42}));
    auto host_x = gen({42});
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    // delta depends on x, but is not differentiable wrt x;
    // an invalid grad is registered for AddUpdate to fix this case
    auto delta = opr::VirtualDep::make({opr::SetGrad::make(x, nullptr), x});
    auto updated = opr::AddUpdate::make(dest, delta);
    auto y = opr::reduce_ax_sum(updated + x, 0);
    auto dx = cg::grad(y, x);
    HostTensorND host_dx;
    auto func = graph->compile({make_callback_copy(dx, host_dx)});
    func->execute();
    for (size_t i = 0; i < host_dx.shape(0); ++i) {
        MGB_ASSERT_FLOAT_EQ(host_dx.ptr<float>()[i], 1.f);
    }
}

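// A chain of elemwise oprs should reuse the input buffer instead of allocating
// new memory: dev_ptr of x, y and z must all be identical.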
TEST(TestOprBasicArith, MemFwd) {
    constexpr size_t SIZE = 12321;
    HostTensorGenerator<> gen;
    auto host_x = gen({SIZE});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x).rename("x"),
         y = opr::sin(x),
         z = y + 1;
    HostTensorND host_z;
    auto func = graph->compile({make_callback_copy(z, host_z)});
    func->execute();
    ASSERT_EQ(dev_ptr(x), dev_ptr(y));
    ASSERT_EQ(dev_ptr(x), dev_ptr(z));
    for (size_t i = 0; i < SIZE; ++ i) {
        MGB_ASSERT_FLOAT_EQ(host_z.ptr<float>()[i],
                std::sin(host_x->ptr<float>()[i]) + 1.f);
    }
}

TEST(TestOprBasicArith, BinaryGradWithBroadcast) {
    using Checker = AutoOprChecker<3, 1>;
    auto make_graph = [](const Checker::SymInpArray &inputs) ->
            Checker::SymOutArray {
        return {inputs[0] + (opr::MarkDynamicVar::make(inputs[1]) + inputs[2])};
    };

    auto fwd = [](Checker::NumOutArray &dest, Checker::NumInpArray inp) {
        host_add(dest[0], *inp[0], *inp[1]);
        host_add(dest[0], dest[0], *inp[2]);
    };

    Checker(make_graph, fwd).
        run({TensorShape{2, 3}, TensorShape{2, 3}, TensorShape{1}}).
        run({TensorShape{1, 5}, TensorShape{1, 1}, TensorShape{5, 1}}).
        run({TensorShape{2, 1, 1}, TensorShape{1, 3, 1}, TensorShape{1, 1, 4}}).
        run({TensorShape{1, 1, 1}, TensorShape{1, 3, 1}, TensorShape{2, 3, 4}});
}

TEST(TestOprBasicArith, BinaryBroadcastCorrectness) {
    using Checker = AutoOprChecker<2, 1>;

    auto run = [&](bool dyn_inp) {
        auto make_graph = [&](const Checker::SymInpArray &inputs) ->
                Checker::SymOutArray {
            auto x = inputs[0], y = inputs[1];
            if (dyn_inp) {
                x = opr::MarkDynamicVar::make(x);
                y = opr::MarkDynamicVar::make(y);
            }
            x.rename("x");
            y.rename("y");
            return {x * y};
        };

        auto fwd = [](Checker::NumOutArray &dest, Checker::NumInpArray inp) {
            TensorShape oshp;
            megdnn::Elemwise::deduce_shape({inp[0]->shape(), inp[1]->shape()},
                    oshp);
            auto &&dv = dest[0].comp_node(inp[0]->comp_node()).resize(oshp);
            auto &&iv0 = inp[0]->sub(SubTensorSpec::make_from_layout(
                         inp[0]->layout().broadcast(oshp))),
                 &&iv1 = inp[1]->sub(SubTensorSpec::make_from_layout(
                         inp[1]->layout().broadcast(oshp)));
            auto it0 = megdnn::tensor_iter_valonly<float>(
                         iv0.as_megdnn()).begin(),
                 it1 = megdnn::tensor_iter_valonly<float>(
                         iv1.as_megdnn()).begin();
            for (size_t i = 0, it = oshp.total_nr_elems(); i < it; ++ i) {
                dv.ptr<float>()[i] = *it0 * *it1;
                ++ it0;
                ++ it1;
            }
        };

        Checker::RunOptions opt;
        opt.numdiff_eps = 1;
        Checker(make_graph, fwd).
            run({TensorShape{5, 3}, {5, 3}}, opt).
            run({TensorShape{2, 2, 1, 1}, {1, 2, 1, 1}}, opt).
            run({TensorShape{1, 2}, {2, 1}}, opt).
            run({TensorShape{3, 2, 5}, {1}}, opt).
            run({TensorShape{4, 5, 1, 1}, {4, 5, 6, 7}}, opt).
            run({TensorShape{8, 4, 1, 1}, {1, 4, 5, 1}}, opt);
    };

    run(false);
    run(true);
}

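// pow(x, 2) followed by a sum reduction should be fused into a single Reduce
// opr with mode SUM_SQR.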
TEST(TestOprBasicArith, Optimize) {
    auto graph = ComputingGraph::make();
    HostTensorGenerator<> gen;
    auto host_x = gen({23});
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         x_sum2 = opr::reduce_sum(
                 opr::pow(x, x.make_scalar(2)), x.make_scalar(1));
    ASSERT_EQ(opr::Reduce::Mode::SUM_SQR,
              x_sum2.node()->owner_opr()->cast_final_safe<opr::Reduce>().
              param().mode);

    float sum2 = 0;
    auto xptr = host_x->ptr<float>();
    for (size_t i = 0, it = host_x->shape().total_nr_elems(); i < it; ++ i) {
        sum2 += xptr[i] * xptr[i];
    }

    HostTensorND host_x_sum2;
    auto func = graph->compile({make_callback_copy(x_sum2, host_x_sum2)});
    func->execute();

    ASSERT_EQ(TensorShape{1}, host_x_sum2.shape());
    MGB_ASSERT_FLOAT_EQ(sum2, host_x_sum2.ptr<float>()[0]);
}

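// Float to Int32 conversion should match static_cast<int> on the host; resizing
// the input to an empty shape must propagate to the output shape.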
TEST(TestOprBasicArith, TypeCvt) {
    auto graph = ComputingGraph::make();
    HostTensorGenerator<> gen{0, 1000};
    auto host_x = gen({23});
    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::TypeCvt::make(x, dtype::Int32{});
    HostTensorND host_y;
    auto func = graph->compile({make_callback_copy(y, host_y)});
    func->execute();
    auto px = host_x->ptr<float>();
    auto py = host_y.ptr<int>();
    for (size_t i = 0; i < 23; ++i) {
        ASSERT_EQ(static_cast<int>(px[i]), py[i]);
    }

    host_x->resize({3, 0});
    func->execute();
    ASSERT_EQ(TensorShape({3, 0}), host_y.shape());
}

TEST(TestOprBasicArith, TypeCvtBool) {
    auto graph = ComputingGraph::make();
    HostTensorGenerator<dtype::Int32> gen;
    auto host_x = gen({3});
    auto px = host_x->ptr<int>();
    px[0] = -1;
    px[1] = 0;
    px[2] = 1;

    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::TypeCvt::make(x, dtype::Bool{});
    HostTensorND host_y;
    auto func = graph->compile({make_callback_copy(y, host_y)});
    func->execute();

    auto py = host_y.ptr<bool>();
    for (size_t i = 0; i < 3; i ++) {
        ASSERT_EQ(static_cast<bool>(px[i]), py[i]);
    }
    ASSERT_EQ(TensorShape({3}), host_y.shape());
}

TEST(TestOprBasicArith, TypeCvtFromBool) {
    auto graph = ComputingGraph::make();
    HostTensorGenerator<dtype::Bool> gen;
    auto host_x = gen({2});
    auto px = host_x->ptr<bool>();
    px[0] = true;
    px[1] = false;

    auto x = opr::Host2DeviceCopy::make(*graph, host_x),
         y = opr::TypeCvt::make(x, dtype::Int32{});
    HostTensorND host_y;
    auto func = graph->compile({make_callback_copy(y, host_y)});
    func->execute();

    auto py = host_y.ptr<int>();
    for (size_t i = 0; i < 2; i ++) {
        ASSERT_EQ(static_cast<int>(px[i]), py[i]);
    }
    ASSERT_EQ(TensorShape({2}), host_y.shape());
}

TEST(TestOprBasicArith, ElemwiseMemFwd) {
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    HostTensorGenerator<> gen;
    auto host_x = gen({3, 3}),
         host_y = gen({3, 3});

    // x[:, ::-1]
    auto rev = [](SymbolVar x) {
        return opr::Subtensor::make(x,
                {opr::Subtensor::AxisIndexer::make_interval(
                        1, None, None, x.make_scalar(-1))});
    };

    auto x = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x),
         y = opr::Host2DeviceCopy::make_no_fwd(*graph, host_y),
         y0 = rev(y),
         y1 = rev(x),
         z0 = x + y0,
         z1 = x + y1,
         z2 = x + x;

    auto check = [&graph, &host_x, x](SymbolVar y, SymbolVar z, float* py,
                                      bool rev_y, bool should_fwd) {
        HostTensorND host_z;
        auto func = graph->compile({make_callback_copy(z, host_z)});
        func->execute();

        HostTensorND expect;
        expect.copy_from(*host_x);
        auto pe = expect.ptr<float>();
        for (size_t i = 0; i < 3; ++i) {
            auto cur_py = py + i * 3 + static_cast<int>(rev_y) * 2;
            for (size_t j = 0; j < 3; ++j) {
                pe[i * 3 + j] += *cur_py;
                cur_py += rev_y ? -1 : 1;
            }
        }
        MGB_ASSERT_TENSOR_EQ(expect, host_z);

        auto xptr = dev_ptr(x), yptr = dev_ptr(y), zptr = dev_ptr(z);
        if (should_fwd) {
            ASSERT_EQ(zptr, xptr);
        } else {
            ASSERT_NE(zptr, xptr);
            ASSERT_NE(zptr, yptr);
        }
    };

    check(y0, z0, host_y->ptr<float>(), true, true);
    ASSERT_EQ(dev_ptr(y) + 2 * sizeof(float), dev_ptr(y0));

    check(y1, z1, host_x->ptr<float>(), true, false);
    ASSERT_EQ(dev_ptr(x) + 2 * sizeof(float), dev_ptr(y1));

    check(x, z2, host_x->ptr<float>(), false, true);
}

TEST(TestOprBasicArith, ElemwiseRequireContig) {
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    HostTensorGenerator<> gen;
    auto host_x = gen({3, 3}), host_y = gen({1, 3});
    auto x = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x),
         y = opr::Host2DeviceCopy::make_no_fwd(*graph, host_y),
         xt = opr::Dimshuffle::make(x, {1, 0}),
         yb = y.broadcast({3, 3}),
         z = xt + yb;
    HostTensorND host_z;
    auto func = graph->compile({make_callback_copy(z, host_z)});
    func->execute();

    HostTensorND expect{host_x->comp_node(), host_x->dtype()};
    expect.resize({3, 3});
    auto px = host_x->ptr<float>(), py = host_y->ptr<float>(),
         pe = expect.ptr<float>();
    for (size_t i = 0; i < 3; ++i) {
        for (size_t j = 0; j < 3; ++j) {
            pe[i * 3 + j] = px[j * 3 + i] + py[j];
        }
    }
    MGB_ASSERT_TENSOR_EQ(expect, host_z);

    ASSERT_NE(dev_ptr(x), dev_ptr(xt));
    ASSERT_EQ(dev_ptr(y), dev_ptr(yb));
    ASSERT_EQ(dev_ptr(xt), dev_ptr(z));
}

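// TypeCvt oprs are deduplicated only when the target quantized dtype parameters
// are identical.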
TEST(TestOprBasicArith, TypeCvtDedup) {
    HostTensorGenerator<> gen;
    auto host_x = gen({5, 5, 5, 5});
    auto graph = ComputingGraph::make();
    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    dtype::Quantized8Asymm dtype1(0.01f, (uint8_t) 123);
    dtype::Quantized8Asymm dtype2(0.02f, (uint8_t) 234);
    auto cvt1 = opr::TypeCvt::make(x, dtype1);
    auto cvt2 = opr::TypeCvt::make(x, dtype2);
    ASSERT_NE(cvt1.node(), cvt2.node());
    dtype::Quantized8Asymm dtype3(0.01f, (uint8_t) 123);
    auto cvt3 = opr::TypeCvt::make(x, dtype3);
    ASSERT_EQ(cvt1.node(), cvt3.node());
}

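// PowC on a non-contiguous subtensor, checked numerically by AutoOprChecker;
// the subtensor must read-only forward the input memory.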
TEST(TestOprBasicArith, PowC) {
    using Checker = AutoOprChecker<1, 1>;
    SymbolVar inp, sub;
    auto make_graph =
            [&](const Checker::SymInpArray& inputs) -> Checker::SymOutArray {
        // test non-contig
        inp = inputs[0];
        sub = opr::Subtensor::make(
                inp, {opr::Subtensor::AxisIndexer::make_interval(
                        1, None, inputs[0].make_scalar(-2), None)});
        return {opr::PowC::make(sub, 2.f)};
    };

    auto fwd = [](Checker::NumOutArray& dest, Checker::NumInpArray inp) {
        TensorShape oshp = inp[0]->shape();
        oshp[1] -= 2;
        size_t size_x = oshp[0],
               strd_x = inp[0]->shape().total_nr_elems() / size_x,
               size_y = oshp.total_nr_elems() / size_x;
        auto px = inp[0]->ptr<float>(), py = dest[0].resize(oshp).ptr<float>();
        for (size_t i = 0; i < size_x; ++i) {
            for (size_t j = 0; j < size_y; ++j) {
                float xv = px[i * strd_x + j], yv = xv * xv;
                py[i * size_y + j] = yv;
            }
        }
    };

    Checker checker{make_graph, fwd};
    checker.run({TensorShape{2, 3}})
            .run({TensorShape{12, 33}})
            .run({TensorShape{5, 33, 7}});
    ASSERT_EQ(prev_dev_ptr(inp), prev_dev_ptr(sub));
}

TEST(TestOprBasicArith, PowCInfer) {
    HostTensorGenerator<> gen;
    auto run = [&](bool contig) {
        auto host_x = gen({3, contig ? 4u : 5u});
        auto graph = ComputingGraph::make();
        auto x = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x),
             xsub = opr::Subtensor::make(
                     x, {opr::Subtensor::AxisIndexer::make_interval(
                             1, None, x.make_scalar(4), None)}),
             y = opr::PowC::make(xsub, 4.f);
        auto y_infer = graph->static_infer_manager().infer_value(y.node());
        HostTensorND host_y;
        auto func = graph->compile({make_callback_copy(y, host_y)});
        func->execute();
        MGB_ASSERT_TENSOR_EQ(host_y, HostTensorND::make_proxy(y_infer));
        ASSERT_EQ(prev_dev_ptr(x), prev_dev_ptr(xsub));
        if (contig) {
            // inplace computing
            ASSERT_EQ(prev_dev_ptr(xsub), prev_dev_ptr(y));
        } else {
            ASSERT_NE(prev_dev_ptr(xsub), prev_dev_ptr(y));
        }
    };
    run(false);
    run(true);
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
