
sublinear_memory.cpp 22 kB

/**
 * \file src/core/test/sublinear_memory.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include "megbrain/graph.h"
#include "megbrain/graph/event.h"
#include "megbrain/opr/basic_arith_wrapper.h"
#include "megbrain/opr/blas.h"
#include "megbrain/opr/dnn/convolution.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/tensor_manip.h"
#include "megbrain/opr/utility.h"
#include "megbrain/serialization/sereg.h"
#include "megbrain/test/helper.h"

using namespace mgb;
#if MGB_ENABLE_SUBLINEAR

namespace mgb {
namespace cg {

class SeqModifierForSublinearMemory {
public:
    const CompNode::UnorderedMap<size_t>& prev_min_bottleneck();
};

class ComputingGraphImpl : public ComputingGraph {
public:
    SeqModifierForSublinearMemory& seq_modifier_for_sublinear_memory();
};

}  // namespace cg
}  // namespace mgb

namespace {
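// Helper operator whose output has m_scale times as many elements as its
// input; when constructed with bad=true it sets NO_AUTOMATIC_DUP, so the
// sublinear-memory pass must not duplicate (recompute) it. scn_do_execute()
// only asserts, since the tests below never execute this operator. A shallow
// copy function is registered after the class so the operator can still be
// duplicated where that is allowed.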
MGB_DEFINE_OPR_CLASS(SublinearBadOpr, cg::SingleCNOperatorNodeBase) // {
    bool m_flag;
    size_t m_scale;

    void scn_do_execute() override { mgb_assert(0); }

    NodeProp* do_make_node_prop() const override {
        auto prop = Super::do_make_node_prop();
        if (m_flag) {
            prop->add_flag(NodeProp::Flag::NO_AUTOMATIC_DUP);
        }
        return prop;
    }

    void init_output_static_infer_desc() override {
        using namespace cg::static_infer;
        auto&& mgr = owner_graph()->static_infer_manager();
        auto infer_shape = [this](TensorShape& dst, const InpVal& inp) {
            size_t n = inp.val.at(0).shape().total_nr_elems();
            dst = TensorShape{n * m_scale};
            return true;
        };
        mgr.register_shape_infer(
                output(0),
                {SourceType::DEP, {{input(0), DepType::SHAPE}}, infer_shape});
    }

public:
    SublinearBadOpr(VarNode* inp, bool bad, size_t scale,
                    OperatorNodeConfig config = {}) :
            Super{inp->owner_graph(), config, "subliner_bad_op", {inp}},
            m_flag{bad}, m_scale{scale} {
        add_input({inp});
        add_output(None);
    }

    static SymbolVar make(SymbolVar inp, bool bad, size_t scale,
                          OperatorNodeConfig config = {}) {
        return inp.node()
                ->owner_graph()
                ->insert_opr(std::make_unique<SublinearBadOpr>(
                        inp.node(), bad, scale, config))
                ->output(0);
    }

    bool flag() const { return m_flag; }

    size_t scale() const { return m_scale; }
};
MGB_DYN_TYPE_OBJ_FINAL_IMPL(SublinearBadOpr);

cg::OperatorNodeBase* bad_opr_shallow_copy(
        const serialization::OprShallowCopyContext& ctx,
        const cg::OperatorNodeBase& opr_, const VarNodeArray& inputs,
        const OperatorNodeConfig& config) {
    mgb_assert(inputs.size() == 1);
    auto&& opr = opr_.cast_final_safe<SublinearBadOpr>();
    return SublinearBadOpr::make(inputs[0], opr.flag(), opr.scale(), config)
            .node()
            ->owner_opr();
}
MGB_REG_OPR_SHALLOW_COPY(SublinearBadOpr, bad_opr_shallow_copy);

}  // anonymous namespace

#if MGB_CUDA
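// CHECK_REQ skips a GPU test unless a CUDA device with more than 5 GiB of
// free memory is available.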
#define CHECK_REQ                                                    \
    do {                                                             \
        /* force use gpu because on CPU it is too slow */            \
        REQUIRE_GPU(1);                                              \
        if (CompNode::load("gpu0").get_mem_status_bytes().second <=  \
            5ull * 1024 * 1024 * 1024) {                             \
            mgb_log_warn(                                            \
                    "test skipped due to "                           \
                    "insufficient available gpu memory");            \
            return;                                                  \
        }                                                            \
    } while (0)
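// Build a chain of ten 3x3 convolution + ReLU layers on gpu0, take a
// dot-product loss, and check that the parameter gradients computed with
// sublinear memory optimization enabled match the baseline run without it.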
TEST(TestSublinearMemory, FullConv) {
    CHECK_REQ;
    HostTensorGenerator<> gen_;
    auto gen = [&](const TensorShape& shp) { return gen_(shp, "gpu0"); };
    constexpr size_t N = 128, H = 256, W = 256;
    auto host_data = gen({N, 1, H, W});
    auto graph = ComputingGraph::make();
    SymbolVarArray params;
    auto data = opr::Host2DeviceCopy::make(*graph, host_data).rename("data"),
         out = data;
    size_t out_chl = host_data->shape(1), layer_count = 0;
    auto add_layer = [&](size_t oc, size_t h, size_t w) {
        gen_.std(sqrt(2.0 / (out_chl * h * w)));
        auto host_kern = gen({oc, out_chl, h, w});
        auto dev_kern = std::make_shared<DeviceTensorND>();
        dev_kern->copy_from(*host_kern);
        params.emplace_back(opr::SharedDeviceTensor::make(*graph, dev_kern));
        out = opr::relu(opr::Convolution::make(
                out, params.back().rename(ssprintf("param%zu", layer_count)),
                {}));
        out.rename(ssprintf("out%zu", layer_count));
        ++layer_count;
        out_chl = oc;
    };
    for (int i = 0; i < 10; ++i)
        add_layer(5, 3, 3);
    auto loss = opr::Dot::make(out.flatten(), out.flatten());
    std::vector<HostTensorND> grad_params_get(params.size());
    ComputingGraph::OutputSpec out_spec;
    for (size_t i = 0; i < params.size(); ++i) {
        out_spec.emplace_back(make_callback_copy(cg::grad(loss, params[i]),
                                                 grad_params_get[i]));
    }
    std::vector<HostTensorND> grad_params_expect(grad_params_get.size());
    for (bool sublinear : {false, true}) {
        graph->options().enable_sublinear_memory_opt = sublinear;
        auto func = graph->compile(out_spec);
        func->execute();
        if (!sublinear) {
            for (size_t i = 0; i < grad_params_get.size(); ++i)
                grad_params_expect[i].copy_from(grad_params_get[i]);
        }
    }
    for (size_t i = 0; i < grad_params_get.size(); ++i)
        MGB_ASSERT_TENSOR_NEAR(grad_params_get[i], grad_params_expect[i], 1e-3);
}
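// Same gradient-equivalence check as FullConv, but each layer splits its
// input into two channel groups (opr::Split), convolves them separately, and
// concatenates the results (opr::Concat).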
TEST(TestSublinearMemory, ConcatSplit) {
    CHECK_REQ;
    HostTensorGenerator<> gen_;
    auto gen = [&](const TensorShape& shp) { return gen_(shp, "gpu0"); };
    constexpr size_t N = 128, H = 256, W = 256;
    auto host_data = gen({N, 2, H, W});
    auto graph = ComputingGraph::make();
    SymbolVarArray params;
    auto data = opr::Host2DeviceCopy::make(*graph, host_data).rename("data"),
         out = data;
    size_t out_chl = host_data->shape(1), layer_count = 0;
    auto add_layer = [&](size_t oc, size_t h, size_t w) {
        auto prev =
                opr::Split::make(out, opr::Split::Options::make_average(1, 2));
        SymbolVarArray cur_out(2);
        size_t cur_in_chl[] = {out_chl / 2, out_chl - out_chl / 2};
        size_t cur_out_chl[] = {oc / 2, oc - oc / 2};
        for (int i = 0; i < 2; ++i) {
            gen_.std(sqrt(2.0 / (cur_in_chl[i] * h * w)));
            auto host_kern = gen({cur_out_chl[i], cur_in_chl[i], h, w});
            auto dev_kern = std::make_shared<DeviceTensorND>();
            dev_kern->copy_from(*host_kern);
            params.emplace_back(
                    opr::SharedDeviceTensor::make(*graph, dev_kern));
            cur_out[i] =
                    opr::relu(opr::Convolution::make(
                            prev[i],
                            params.back().rename(ssprintf(
                                    "param%zu:%d", layer_count, i)),
                            {}))
                            .rename(ssprintf("out%zu:%d", layer_count, i));
        }
        ++layer_count;
        out_chl = oc;
        out = opr::Concat::make(cur_out, 1);
    };
    for (int i = 0; i < 10; ++i)
        add_layer(6, 3, 3);
    auto loss = opr::Dot::make(out.flatten(), out.flatten());
    std::vector<HostTensorND> grad_params_get(params.size());
    ComputingGraph::OutputSpec out_spec;
    for (size_t i = 0; i < params.size(); ++i) {
        out_spec.emplace_back(make_callback_copy(cg::grad(loss, params[i]),
                                                 grad_params_get[i]));
    }
    std::vector<HostTensorND> grad_params_expect(grad_params_get.size());
    for (bool sublinear : {false, true}) {
        graph->options().enable_sublinear_memory_opt = sublinear;
        auto func = graph->compile(out_spec);
        func->execute();
        if (!sublinear) {
            for (size_t i = 0; i < grad_params_get.size(); ++i)
                grad_params_expect[i].copy_from(grad_params_get[i]);
        }
    }
    for (size_t i = 0; i < grad_params_get.size(); ++i)
        MGB_ASSERT_TENSOR_NEAR(grad_params_get[i], grad_params_expect[i], 1e-3);
}
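// A single layer whose input goes through a three-way Split (a multi-output
// operator); the third branch additionally passes through ten stacked ReLUs.
// Gradients with sublinear memory enabled must match the baseline run.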
TEST(TestSublinearMemory, MultiOutputOpr) {
    CHECK_REQ;
    HostTensorGenerator<> gen_;
    auto gen = [&](const TensorShape& shp) { return gen_(shp, "gpu0"); };
    constexpr size_t N = 128, H = 256, W = 256;
    auto host_data = gen({N, 3, H, W});
    auto graph = ComputingGraph::make();
    SymbolVarArray params;
    auto data = opr::Host2DeviceCopy::make(*graph, host_data).rename("data"),
         out = data;
    size_t out_chl = host_data->shape(1), layer_count = 0;
    auto add_layer = [&](size_t oc, size_t h, size_t w) {
        auto prev =
                opr::Split::make(out, opr::Split::Options::make_average(1, 3));
        SymbolVarArray cur_out(3);
        size_t cur_in_chl[] = {out_chl / 3, out_chl / 3,
                               out_chl - out_chl / 3 * 2};
        size_t cur_out_chl[] = {oc / 3, oc / 3, oc - oc / 3 * 2};
        for (int i = 0; i < 3; ++i) {
            gen_.std(sqrt(2.0 / (cur_in_chl[i] * h * w)));
            auto host_kern = gen({cur_out_chl[i], cur_in_chl[i], h, w});
            auto dev_kern = std::make_shared<DeviceTensorND>();
            dev_kern->copy_from(*host_kern);
            params.emplace_back(
                    opr::SharedDeviceTensor::make(*graph, dev_kern));
            auto f = opr::Convolution::make(
                    prev[i],
                    params.back().rename(
                            ssprintf("param%zu:%d", layer_count, i)),
                    {});
            if (i == 2)
                for (size_t j = 0; j < 10; ++j)
                    f = opr::relu(f);
            cur_out[i] = f;
        }
        ++layer_count;
        out_chl = oc;
        out = opr::Concat::make(cur_out, 1);
    };
    add_layer(6, 3, 3);
    auto loss = opr::Dot::make(out.flatten(), out.flatten());
    std::vector<HostTensorND> grad_params_get(params.size());
    ComputingGraph::OutputSpec out_spec;
    for (size_t i = 0; i < params.size(); ++i) {
        out_spec.emplace_back(make_callback_copy(cg::grad(loss, params[i]),
                                                 grad_params_get[i]));
    }
    std::vector<HostTensorND> grad_params_expect(grad_params_get.size());
    for (bool sublinear : {false, true}) {
        graph->options().enable_sublinear_memory_opt = sublinear;
        auto func = graph->compile(out_spec);
        func->execute();
        if (!sublinear) {
            for (size_t i = 0; i < grad_params_get.size(); ++i)
                grad_params_expect[i].copy_from(grad_params_get[i]);
        }
    }
    for (size_t i = 0; i < grad_params_get.size(); ++i)
        MGB_ASSERT_TENSOR_NEAR(grad_params_get[i], grad_params_expect[i], 1e-3);
}
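// One hundred padded 3x3 convolution + ReLU layers with a few wider layers
// (12 and 15 channels) in the repeating pattern. Gradients are requested in
// reverse parameter order, the executed graph is dumped to JSON, and the
// results are compared against the baseline with a tighter 1e-4 tolerance.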
TEST(TestSublinearMemory, LongChain) {
    CHECK_REQ;
    HostTensorGenerator<> gen_;
    auto gen = [&](const TensorShape& shp) { return gen_(shp, "gpu0"); };
    constexpr size_t N = 32, C = 3, H = 224, W = 224;
    auto host_data = gen({N, C, H, W});
    auto graph = ComputingGraph::make();
    SymbolVarArray params;
    auto data = opr::Host2DeviceCopy::make(*graph, host_data).rename("data"),
         out = data;
    size_t out_chl = host_data->shape(1), layer_count = 0;
    opr::Convolution::Param conv_param;
    conv_param.pad_h = 1;
    conv_param.pad_w = 1;
    auto add_layer = [&](size_t oc, size_t h, size_t w) {
        gen_.std(sqrt(2.0 / (out_chl * h * w)));
        auto host_kern = gen({oc, out_chl, h, w});
        auto dev_kern = std::make_shared<DeviceTensorND>();
        dev_kern->copy_from(*host_kern);
        params.emplace_back(opr::SharedDeviceTensor::make(*graph, dev_kern));
        out = opr::relu(opr::Convolution::make(
                out, params.back().rename(ssprintf("param%zu", layer_count)),
                conv_param));
        out.rename(ssprintf("out%zu", layer_count));
        ++layer_count;
        out_chl = oc;
    };
    int OC[] = {1, 1, 1, 12, 1, 1, 1, 1, 15, 1};
    for (int i = 1; i <= 10; ++i) {
        for (int j = 0; j < 10; j++)
            add_layer(OC[j], 3, 3);
    }
    auto loss = opr::Dot::make(out.flatten(), out.flatten());
    std::vector<HostTensorND> grad_params_get(params.size());
    ComputingGraph::OutputSpec out_spec;
    for (int i = params.size() - 1; i >= 0; --i) {
        out_spec.emplace_back(make_callback_copy(cg::grad(loss, params[i]),
                                                 grad_params_get[i]));
    }
    std::vector<HostTensorND> grad_params_expect(grad_params_get.size());
    for (bool sublinear : {false, true}) {
        graph->options().enable_sublinear_memory_opt = sublinear;
        auto func = graph->compile(out_spec);
        func->execute();
        func->to_json()->writeto_fpath(output_file(
                ssprintf("TestSublinearMemory.LongChain%d.json", sublinear)));
        if (!sublinear) {
            for (size_t i = 0; i < grad_params_get.size(); ++i)
                grad_params_expect[i].copy_from(grad_params_get[i]);
        }
    }
    for (size_t i = 0; i < grad_params_get.size(); ++i)
        MGB_ASSERT_TENSOR_NEAR(grad_params_get[i], grad_params_expect[i], 1e-4);
}
#endif  // MGB_CUDA
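// x0 is checked three times (x1, x2, x3) with unrelated work scheduled in
// between via priorities. With sublinear memory enabled, the StaticMemAlloc
// event must report a static allocation below NS * 2.5, i.e. x0 is discarded
// after its earlier uses instead of being kept alive until x3.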
TEST(TestSublinearMemory, MultiReuse) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    constexpr size_t N = 1024, NS = N * sizeof(dt_float32);
    auto host_x = gen({N}), host_y0 = gen({N * 2}), host_y1 = gen({N * 2}),
         host_z = gen({N});
    auto call_check = [&](SymbolVar val, const HostTensorND& expected) {
        auto cb = [expected](const DeviceTensorND& val) {
            HostTensorND get;
            get.copy_from(val).sync();
            MGB_ASSERT_TENSOR_EQ(expected, get);
        };
        return opr::CallbackInjector::make(val, {true, cb});
    };
    // x0 should be discarded after x2 finishes
    auto x0 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x),
         z0 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_z),
         z1 = call_check(z0, *host_z), x1 = call_check(x0, *host_x),
         x2 = call_check(x0, *host_x),
         y0 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_y0),
         y01 = call_check(y0, *host_y0),
         y1 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_y1),
         y11 = call_check(y1, *host_y1), x3 = call_check(x0, *host_x);
    SymbolVar vars[] = {x0, z0, z1, x1, x2, y0, y01, y1, y11, x3};
    ComputingGraph::OutputSpec out_spec;
    for (size_t i = 0; i < sizeof(vars) / sizeof(vars[0]); ++i) {
        set_priority(vars[i], i);
        out_spec.push_back({vars[i], {}});
    }
    size_t alloc_size = 0;
    auto alloc_size_hdl =
            graph->event().register_receiver<cg::event::StaticMemAlloc>(
                    [&](const cg::event::StaticMemAlloc& s) {
                        if (s.comp_node.valid()) {
                            alloc_size = s.alloc_size;
                        }
                    });
    graph->options().enable_sublinear_memory_opt = true;
    auto func = graph->compile(out_spec);
    func->execute();
    ASSERT_GT(alloc_size, 0u);
    ASSERT_LT(alloc_size, NS * 2 + (NS / 2));
}
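// Combines statically shaped arithmetic on x and p with a dynamically shaped
// chain (MarkDynamicVar, GetVarShape, reshape). Verifies the numeric outputs,
// the reshaped tensor t1, and that the static allocation stays below NS * 2.5.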
TEST(TestSublinearMemory, DynamicShape) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    constexpr size_t N = 1024, NS = N * sizeof(dt_float32);
    auto host_x = gen({N}), host_p = gen({N}), host_t = gen({N / 2 + 1, 2});
    auto x = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x).rename("x"),
         y0 = (x + 1.f).rename("y0"), y1 = (y0 + .4f).rename("y1"),
         p = opr::Host2DeviceCopy::make_no_fwd(*graph, host_p).rename("p"),
         po0 = (p + .5f).rename("po0"), po1 = (p + .4f).rename("po1"),
         po = (po0 + po1).rename("po"), xt = (x + .5f).rename("xt"),
         xdyn = opr::MarkDynamicVar::make(xt),
         t1_shp = (opr::GetVarShape::make(xdyn) + 2).rename("t0"),
         t0 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_t),
         t1 = t0.reshape(t1_shp);
    set_priority(y0, 1);
    set_priority(y1, 1);
    set_priority(p, 2);
    set_priority(po, 2);
    set_priority(xt, 3);
    set_priority(xdyn, 4);
    set_priority(t0, 5);
    HostTensorND host_y1, host_t1;
    size_t alloc_size = 0;
    auto alloc_size_hdl =
            graph->event().register_receiver<cg::event::StaticMemAlloc>(
                    [&](const cg::event::StaticMemAlloc& s) {
                        if (s.comp_node.valid()) {
                            alloc_size = s.alloc_size;
                        }
                    });
    graph->options().graph_opt_level = 0;
    graph->options().enable_sublinear_memory_opt = true;
    auto func = graph->compile({make_callback_copy(y1, host_y1),
                                {po, {}},
                                make_callback_copy(t1, host_t1)});
    func->execute().to_json()->writeto_fpath(
            output_file("TestSublinearMemory.DynamicShape.json"));
    ASSERT_GT(alloc_size, 0u);
    ASSERT_LT(alloc_size, NS * 2 + NS / 2);
    auto px = host_x->ptr<float>(), py = host_y1.ptr<float>();
    for (size_t i = 0; i < N; ++i) {
        MGB_ASSERT_FLOAT_EQ(px[i] + 1.4f, py[i]);
    }
    host_t->resize({N + 2});
    MGB_ASSERT_TENSOR_EQ(*host_t, host_t1);
}
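// Enabling sublinear memory on a graph with a single SharedDeviceTensor
// output must compile and execute without error.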
TEST(TestSublinearMemory, EmptyGraph) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    graph->options().enable_sublinear_memory_opt = true;
    auto x = opr::SharedDeviceTensor::make(*graph, *gen({1}));
    auto func = graph->compile({{x, {}}});
    func->execute();
}
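// With and without sublinear memory optimization, the AddUpdate operator y4
// must keep an entry for y0 in its node_prop() dep map after compilation.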
TEST(TestSublinearMemory, DepsInTopoSort) {
    HostTensorGenerator<> gen;
    auto graph = ComputingGraph::make();
    constexpr size_t N = 1024;
    auto host_x0 = gen({N}), host_x1 = gen({N}), host_x2 = gen({N}),
         host_x3 = gen({N});
    auto x0 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x0),
         x1 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x1),
         x2 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x2),
         x3 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x3),
         x4 = opr::SharedDeviceTensor::make(*graph, *host_x0), y0 = x3 + x4,
         y1 = y0 + x2, y2 = y1 + x1, y3 = y2 + x0,
         y4 = opr::AddUpdate::make(x4, y3);
    SymbolVar vars[] = {x0, x1, x2, x3, x4, y0, y1, y2, y3, y4};
    ComputingGraph::OutputSpec out_spec;
    for (size_t i = 0; i < sizeof(vars) / sizeof(vars[0]); ++i) {
        set_priority(vars[i], i);
        out_spec.push_back({vars[i], {}});
    }
    graph->options().graph_opt_level = 0;
    for (bool enable_sublinear : {false, true}) {
        graph->options().enable_sublinear_memory_opt = enable_sublinear;
        auto func = graph->compile(out_spec);
        ASSERT_EQ(1u, y4.node()->owner_opr()->node_prop().dep_map().count(
                              y0.node()));
    }
}
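// An operator flagged NO_AUTOMATIC_DUP must not be duplicated (recomputed) by
// the sublinear pass. The reported minimal bottleneck must match the byte
// count expected from the formula in the inline comment below, and the
// compiled sequence must contain 2 SublinearBadOpr instances when the flag is
// set versus 3 when duplication is allowed.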
TEST(TestSublinearMemory, BadOpr) {
    HostTensorGenerator<> gen;
    auto cn = CompNode::load("xpu0");
    constexpr size_t N = 1024, Scale = 2;
    auto host_x = gen({N}, cn);
    for (bool bad : {false, true}) {
        auto graph = ComputingGraph::make();
        auto x = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x),
             bad_var = SublinearBadOpr::make(x, bad, Scale),
             y0 = opr::reduce_sum(bad_var, x.make_scalar_dt(1)),
             y1 = SublinearBadOpr::make(y0, false, N * Scale),
             y = y1 + 1,
             z = opr::reduce_max(bad_var, x.make_scalar_dt(1));
        set_priority(y0, 0);
        set_priority(y1, 1);
        set_priority(y, 2);
        set_priority(z, 3);
        graph->options().graph_opt_level = 0;
        graph->options().enable_sublinear_memory_opt = 1;
        graph->options().sublinear_mem_cofig.genetic_nr_iter = 50;
        auto func = graph->compile({{y, {}}, {z, {}}});
        auto&& results = static_cast<cg::ComputingGraphImpl*>(graph.get())
                                 ->seq_modifier_for_sublinear_memory()
                                 .prev_min_bottleneck();
        // bottleneck:
        //   if bad: at y = y1 + 1, bad_var must be kept to compute z later,
        //       so total memory usage is
        //       N * Scale * 2 (bad_var and y1) + 1 (immutable tensor 1)
        //   else: bad_var = BadOpr(x), so total memory usage is
        //       N (x) + N * Scale (bad_var); bad_var would be recomputed
        //       when calculating z = reduce(bad_var)
        size_t expect = bad ? N * Scale * 2 + 1 : N * Scale + N;
        ASSERT_EQ(results.at(cn), expect * host_x->dtype().size());
        size_t nr_bad_opr = 0;
        auto count_up = [&nr_bad_opr](cg::OperatorNodeBase* op) {
            if (op->dyn_typeinfo() == SublinearBadOpr::typeinfo()) {
                ++nr_bad_opr;
            }
            return true;
        };
        func->iter_opr_seq(count_up);
        ASSERT_EQ(nr_bad_opr, bad ? 2 : 3);
    }
}

#else
#pragma message "tests are disabled as Sublinear is not enabled."
#endif  // MGB_ENABLE_SUBLINEAR

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

The MegEngine installation package already bundles the CUDA environment needed to run code on GPU, so there is no separate CPU or GPU build to choose between. To run GPU programs, make sure the machine has a GPU and its driver installed. If you would like to try deep learning development on cloud GPU compute, you are welcome to visit the MegStudio platform.