- /**
- * \file src/core/test/sublinear_memory.cpp
- * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
- *
- * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- */
-
- #include "megbrain/graph.h"
- #include "megbrain/graph/event.h"
- #include "megbrain/opr/basic_arith_wrapper.h"
- #include "megbrain/opr/blas.h"
- #include "megbrain/opr/dnn/convolution.h"
- #include "megbrain/opr/io.h"
- #include "megbrain/opr/tensor_manip.h"
- #include "megbrain/opr/utility.h"
- #include "megbrain/serialization/sereg.h"
- #include "megbrain/test/helper.h"
-
- using namespace mgb;
-
- #if MGB_ENABLE_SUBLINEAR
-
- namespace mgb {
- namespace cg {
-
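- // Minimal re-declarations of graph internals: they expose only the members
- // the tests below call (prev_min_bottleneck() and the accessor reaching it),
- // so the tests can inspect the planner without including private headers.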
- class SeqModifierForSublinearMemory {
- public:
- const CompNode::UnorderedMap<size_t>& prev_min_bottleneck();
- };
-
- class ComputingGraphImpl : public ComputingGraph {
- public:
- SeqModifierForSublinearMemory& seq_modifier_for_sublinear_memory();
- };
-
- } // namespace cg
- } // namespace mgb
-
- namespace {
-
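- // An operator whose output holds `scale` times as many elements as its
- // input. When constructed with bad=true it sets NO_AUTOMATIC_DUP, which
- // forbids the sublinear pass from duplicating (i.e. recomputing) it.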
- MGB_DEFINE_OPR_CLASS(SublinearBadOpr, cg::SingleCNOperatorNodeBase) // {
-
- bool m_flag;
- size_t m_scale;
-
- void scn_do_execute() override {
- mgb_assert(0);
- }
-
- NodeProp* do_make_node_prop() const override {
- auto prop = Super::do_make_node_prop();
- if (m_flag) {
- prop->add_flag(NodeProp::Flag::NO_AUTOMATIC_DUP);
- }
- return prop;
- }
-
- void init_output_static_infer_desc() override {
- using namespace cg::static_infer;
- auto &&mgr = owner_graph()->static_infer_manager();
- auto infer_shape = [this](TensorShape& dst, const InpVal &inp) {
- size_t n = inp.val.at(0).shape().total_nr_elems();
- dst = TensorShape{n * m_scale};
- return true;
- };
- mgr.register_shape_infer(output(0),
- {SourceType::DEP, {{input(0), DepType::SHAPE}}, infer_shape});
- }
-
- public:
- SublinearBadOpr(VarNode* inp, bool bad, size_t scale,
- OperatorNodeConfig config = {}):
-             Super{inp->owner_graph(), config, "sublinear_bad_op", {inp}},
- m_flag{bad}, m_scale{scale}
- {
- add_input({inp});
- add_output(None);
- }
-
- static SymbolVar make(SymbolVar inp, bool bad, size_t scale,
- OperatorNodeConfig config = {}) {
- return inp.node()->owner_graph()->insert_opr(
- std::make_unique<SublinearBadOpr>(inp.node(), bad, scale, config))
- ->output(0);
- }
-
- bool flag() const { return m_flag; }
- size_t scale() const { return m_scale; }
- };
- MGB_DYN_TYPE_OBJ_FINAL_IMPL(SublinearBadOpr);
-
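- // Shallow-copy support is needed so the sublinear pass can duplicate
- // SublinearBadOpr when recomputation is allowed.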
- cg::OperatorNodeBase* bad_opr_shallow_copy(
- const serialization::OprShallowCopyContext &ctx,
- const cg::OperatorNodeBase &opr_,
- const VarNodeArray &inputs,
- const OperatorNodeConfig& config) {
- mgb_assert(inputs.size() == 1);
- auto &&opr = opr_.cast_final_safe<SublinearBadOpr>();
- return SublinearBadOpr::make(
- inputs[0], opr.flag(), opr.scale(), config).node()->owner_opr();
- }
-
- MGB_REG_OPR_SHALLOW_COPY(SublinearBadOpr, bad_opr_shallow_copy);
-
- } // anonymous namespace
-
- #if MGB_CUDA
- #define CHECK_REQ \
- do { \
-         /* force GPU because this test runs too slowly on CPU */ \
- REQUIRE_GPU(1); \
- if (CompNode::load("gpu0").get_mem_status_bytes().second <= \
- 5ull * 1024 * 1024 * 1024) { \
- mgb_log_warn( \
- "test skipped due to " \
- "insufficient available gpu memory"); \
- return; \
- } \
- } while (0)
-
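- // A plain 10-layer conv+relu chain: parameter gradients computed with
- // sublinear memory optimization enabled must match the unoptimized run
- // (within 1e-3).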
- TEST(TestSublinearMemory, FullConv) {
- CHECK_REQ;
-
- HostTensorGenerator<> gen_;
- auto gen = [&](const TensorShape& shp) { return gen_(shp, "gpu0"); };
- constexpr size_t N = 128, H = 256, W = 256;
- auto host_data = gen({N, 1, H, W});
-
- auto graph = ComputingGraph::make();
- SymbolVarArray params;
-
- auto data = opr::Host2DeviceCopy::make(*graph, host_data).rename("data"),
- out = data;
- size_t out_chl = host_data->shape(1), layer_count = 0;
- auto add_layer = [&](size_t oc, size_t h, size_t w) {
- gen_.std(sqrt(2.0 / (out_chl * h * w)));
- auto host_kern = gen({oc, out_chl, h, w});
- auto dev_kern = std::make_shared<DeviceTensorND>();
- dev_kern->copy_from(*host_kern);
- params.emplace_back(opr::SharedDeviceTensor::make(*graph, dev_kern));
- out = opr::relu(opr::Convolution::make(
- out, params.back().rename(ssprintf("param%zu", layer_count)),
- {}));
- out.rename(ssprintf("out%zu", layer_count));
- ++layer_count;
- out_chl = oc;
- };
-
- for (int i = 0; i < 10; ++i)
- add_layer(5, 3, 3);
-
- auto loss = opr::Dot::make(out.flatten(), out.flatten());
- std::vector<HostTensorND> grad_params_get(params.size());
- ComputingGraph::OutputSpec out_spec;
- for (size_t i = 0; i < params.size(); ++i) {
- out_spec.emplace_back(make_callback_copy(cg::grad(loss, params[i]),
- grad_params_get[i]));
- }
-
- std::vector<HostTensorND> grad_params_expect(grad_params_get.size());
- for (bool sublinear : {false, true}) {
- graph->options().enable_sublinear_memory_opt = sublinear;
- auto func = graph->compile(out_spec);
- func->execute();
- if (!sublinear) {
- for (size_t i = 0; i < grad_params_get.size(); ++i)
- grad_params_expect[i].copy_from(grad_params_get[i]);
- }
- }
-
- for (size_t i = 0; i < grad_params_get.size(); ++i)
- MGB_ASSERT_TENSOR_NEAR(grad_params_get[i], grad_params_expect[i], 1e-3);
- }
-
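- // Like FullConv, but each layer splits its input channels in two, convolves
- // the halves separately and concatenates the results, exercising
- // Split/Concat in the sublinear planner.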
- TEST(TestSublinearMemory, ConcatSplit) {
- CHECK_REQ;
-
- HostTensorGenerator<> gen_;
- auto gen = [&](const TensorShape& shp) { return gen_(shp, "gpu0"); };
- constexpr size_t N = 128, H = 256, W = 256;
- auto host_data = gen({N, 2, H, W});
-
- auto graph = ComputingGraph::make();
- SymbolVarArray params;
-
- auto data = opr::Host2DeviceCopy::make(*graph, host_data).rename("data"),
- out = data;
- size_t out_chl = host_data->shape(1), layer_count = 0;
- auto add_layer = [&](size_t oc, size_t h, size_t w) {
- auto prev =
- opr::Split::make(out, opr::Split::Options::make_average(1, 2));
- SymbolVarArray cur_out(2);
- size_t cur_in_chl[] = {out_chl / 2, out_chl - out_chl / 2};
- size_t cur_out_chl[] = {oc / 2, oc - oc / 2};
- for (int i = 0; i < 2; ++i) {
- gen_.std(sqrt(2.0 / (cur_in_chl[i] * h * w)));
- auto host_kern = gen({cur_out_chl[i], cur_in_chl[i], h, w});
- auto dev_kern = std::make_shared<DeviceTensorND>();
- dev_kern->copy_from(*host_kern);
- params.emplace_back(
- opr::SharedDeviceTensor::make(*graph, dev_kern));
- cur_out[i] =
- opr::relu(opr::Convolution::make(
- prev[i],
- params.back().rename(ssprintf(
- "param%zu:%d", layer_count, i)),
- {}))
- .rename(ssprintf("out%zu:%d", layer_count, i));
- }
- ++layer_count;
- out_chl = oc;
- out = opr::Concat::make(cur_out, 1);
- };
-
- for (int i = 0; i < 10; ++i)
- add_layer(6, 3, 3);
-
- auto loss = opr::Dot::make(out.flatten(), out.flatten());
- std::vector<HostTensorND> grad_params_get(params.size());
- ComputingGraph::OutputSpec out_spec;
- for (size_t i = 0; i < params.size(); ++i) {
- out_spec.emplace_back(make_callback_copy(cg::grad(loss, params[i]),
- grad_params_get[i]));
- }
-
- std::vector<HostTensorND> grad_params_expect(grad_params_get.size());
- for (bool sublinear : {false, true}) {
- graph->options().enable_sublinear_memory_opt = sublinear;
- auto func = graph->compile(out_spec);
- func->execute();
- if (!sublinear) {
- for (size_t i = 0; i < grad_params_get.size(); ++i)
- grad_params_expect[i].copy_from(grad_params_get[i]);
- }
- }
-
- for (size_t i = 0; i < grad_params_get.size(); ++i)
- MGB_ASSERT_TENSOR_NEAR(grad_params_get[i], grad_params_expect[i], 1e-3);
- }
-
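- // One layer with a three-way split whose last branch runs a chain of ten
- // relus, exercising discard/recompute decisions on the individual outputs
- // of a multi-output operator.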
- TEST(TestSublinearMemory, MultiOutputOpr) {
- CHECK_REQ;
-
- HostTensorGenerator<> gen_;
- auto gen = [&](const TensorShape& shp) { return gen_(shp, "gpu0"); };
- constexpr size_t N = 128, H = 256, W = 256;
- auto host_data = gen({N, 3, H, W});
-
- auto graph = ComputingGraph::make();
- SymbolVarArray params;
-
- auto data = opr::Host2DeviceCopy::make(*graph, host_data).rename("data"),
- out = data;
- size_t out_chl = host_data->shape(1), layer_count = 0;
- auto add_layer = [&](size_t oc, size_t h, size_t w) {
- auto prev =
- opr::Split::make(out, opr::Split::Options::make_average(1, 3));
- SymbolVarArray cur_out(3);
- size_t cur_in_chl[] = {out_chl / 3, out_chl / 3, out_chl - out_chl / 3 * 2};
- size_t cur_out_chl[] = {oc / 3, oc / 3, oc - oc / 3 * 2};
- for (int i = 0; i < 3; ++i) {
- gen_.std(sqrt(2.0 / (cur_in_chl[i] * h * w)));
- auto host_kern = gen({cur_out_chl[i], cur_in_chl[i], h, w});
- auto dev_kern = std::make_shared<DeviceTensorND>();
- dev_kern->copy_from(*host_kern);
- params.emplace_back(
- opr::SharedDeviceTensor::make(*graph, dev_kern));
- auto f = opr::Convolution::make(
- prev[i], params.back().rename(ssprintf("param%zu:%d", layer_count, i)), {});
-             if (i == 2)
-                 for (size_t j = 0; j < 10; ++j)
-                     f = opr::relu(f);
- cur_out[i] = f;
- }
- ++layer_count;
- out_chl = oc;
- out = opr::Concat::make(cur_out, 1);
- };
-
- add_layer(6, 3, 3);
-
- auto loss = opr::Dot::make(out.flatten(), out.flatten());
- std::vector<HostTensorND> grad_params_get(params.size());
- ComputingGraph::OutputSpec out_spec;
- for (size_t i = 0; i < params.size(); ++i) {
- out_spec.emplace_back(make_callback_copy(cg::grad(loss, params[i]),
- grad_params_get[i]));
- }
-
- std::vector<HostTensorND> grad_params_expect(grad_params_get.size());
- for (bool sublinear : {false, true}) {
- graph->options().enable_sublinear_memory_opt = sublinear;
- auto func = graph->compile(out_spec);
- func->execute();
- if (!sublinear) {
- for (size_t i = 0; i < grad_params_get.size(); ++i)
- grad_params_expect[i].copy_from(grad_params_get[i]);
- }
- }
-
- for (size_t i = 0; i < grad_params_get.size(); ++i)
- MGB_ASSERT_TENSOR_NEAR(grad_params_get[i], grad_params_expect[i], 1e-3);
- }
-
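- // A 100-layer conv chain with occasional wide layers (12 and 15 channels);
- // the compiled graph is also dumped to JSON for manual inspection.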
- TEST(TestSublinearMemory, LongChain) {
- CHECK_REQ;
-
- HostTensorGenerator<> gen_;
- auto gen = [&](const TensorShape& shp) { return gen_(shp, "gpu0"); };
- constexpr size_t N = 32, C = 3, H = 224, W = 224;
- auto host_data = gen({N, C, H, W});
-
- auto graph = ComputingGraph::make();
- SymbolVarArray params;
-
- auto data = opr::Host2DeviceCopy::make(*graph, host_data).rename("data"),
- out = data;
- size_t out_chl = host_data->shape(1), layer_count = 0;
- opr::Convolution::Param conv_param;
- conv_param.pad_h = 1;
- conv_param.pad_w = 1;
- auto add_layer = [&](size_t oc, size_t h, size_t w) {
- gen_.std(sqrt(2.0 / (out_chl * h * w)));
- auto host_kern = gen({oc, out_chl, h, w});
- auto dev_kern = std::make_shared<DeviceTensorND>();
- dev_kern->copy_from(*host_kern);
- params.emplace_back(opr::SharedDeviceTensor::make(*graph, dev_kern));
- out = opr::relu(opr::Convolution::make(
- out, params.back().rename(ssprintf("param%zu", layer_count)),
- conv_param));
- out.rename(ssprintf("out%zu", layer_count));
- ++layer_count;
- out_chl = oc;
- };
-
- int OC[] = {1, 1, 1, 12, 1, 1, 1, 1, 15, 1};
- for (int i = 1; i <= 10; ++i) {
- for (int j = 0; j < 10; j++)
- add_layer(OC[j], 3, 3);
- }
-
- auto loss = opr::Dot::make(out.flatten(), out.flatten());
- std::vector<HostTensorND> grad_params_get(params.size());
- ComputingGraph::OutputSpec out_spec;
-
- for (int i = params.size() - 1; i >= 0; --i) {
- out_spec.emplace_back(make_callback_copy(cg::grad(loss, params[i]),
- grad_params_get[i]));
- }
-
- std::vector<HostTensorND> grad_params_expect(grad_params_get.size());
- for (bool sublinear : {false, true}) {
- graph->options().enable_sublinear_memory_opt = sublinear;
- auto func = graph->compile(out_spec);
- func->execute();
- func->to_json()->writeto_fpath(output_file(
- ssprintf("TestSublinearMemory.LongChain%d.json", sublinear)));
- if (!sublinear) {
- for (size_t i = 0; i < grad_params_get.size(); ++i)
- grad_params_expect[i].copy_from(grad_params_get[i]);
- }
- }
-
- for (size_t i = 0; i < grad_params_get.size(); ++i)
- MGB_ASSERT_TENSOR_NEAR(grad_params_get[i], grad_params_expect[i], 1e-4);
- }
- #endif // MGB_CUDA
-
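- // x0 is read by several CallbackInjectors (x1, x2, x3); in sublinear mode it
- // should be discarded and regenerated between uses, keeping the peak static
- // allocation below NS * 2 + NS / 2 bytes.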
- TEST(TestSublinearMemory, MultiReuse) {
- HostTensorGenerator<> gen;
- auto graph = ComputingGraph::make();
- constexpr size_t N = 1024, NS = N * sizeof(dt_float32);
- auto host_x = gen({N}), host_y0 = gen({N * 2}), host_y1 = gen({N * 2}),
- host_z = gen({N});
- auto call_check = [&](SymbolVar val, const HostTensorND& expected) {
- auto cb = [expected](const DeviceTensorND& val) {
- HostTensorND get;
- get.copy_from(val).sync();
- MGB_ASSERT_TENSOR_EQ(expected, get);
- };
- return opr::CallbackInjector::make(val, {true, cb});
- };
- // x0 should be discarded after x2 finishes
- auto x0 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x),
- z0 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_z),
- z1 = call_check(z0, *host_z), x1 = call_check(x0, *host_x),
- x2 = call_check(x0, *host_x),
- y0 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_y0),
- y01 = call_check(y0, *host_y0),
- y1 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_y1),
- y11 = call_check(y1, *host_y1), x3 = call_check(x0, *host_x);
- SymbolVar vars[] = {x0, z0, z1, x1, x2, y0, y01, y1, y11, x3};
- ComputingGraph::OutputSpec out_spec;
- for (size_t i = 0; i < sizeof(vars) / sizeof(vars[0]); ++i) {
- set_priority(vars[i], i);
- out_spec.push_back({vars[i], {}});
- }
-
- size_t alloc_size = 0;
- auto alloc_size_hdl =
- graph->event().register_receiver<cg::event::StaticMemAlloc>(
- [&](const cg::event::StaticMemAlloc& s) {
- if (s.comp_node.valid()) {
- alloc_size = s.alloc_size;
- }
- });
-
- graph->options().enable_sublinear_memory_opt = true;
- auto func = graph->compile(out_spec);
- func->execute();
- ASSERT_GT(alloc_size, 0u);
- ASSERT_LT(alloc_size, NS * 2 + (NS / 2));
- }
-
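- // Mixes statically and dynamically shaped vars (via MarkDynamicVar): the
- // sublinear pass must bound the static allocation while leaving the dynamic
- // part intact, and all numeric results must stay correct.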
- TEST(TestSublinearMemory, DynamicShape) {
- HostTensorGenerator<> gen;
- auto graph = ComputingGraph::make();
- constexpr size_t N = 1024, NS = N * sizeof(dt_float32);
- auto host_x = gen({N}), host_p = gen({N}), host_t = gen({N / 2 + 1, 2});
- auto x = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x).rename("x"),
- y0 = (x + 1.f).rename("y0"), y1 = (y0 + .4f).rename("y1"),
- p = opr::Host2DeviceCopy::make_no_fwd(*graph, host_p).rename("p"),
- po0 = (p + .5f).rename("po0"), po1 = (p + .4f).rename("po1"),
- po = (po0 + po1).rename("po"), xt = (x + .5f).rename("xt"),
- xdyn = opr::MarkDynamicVar::make(xt),
-          t1_shp = (opr::GetVarShape::make(xdyn) + 2).rename("t1_shp"),
- t0 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_t),
- t1 = t0.reshape(t1_shp);
- set_priority(y0, 1);
- set_priority(y1, 1);
- set_priority(p, 2);
- set_priority(po, 2);
- set_priority(xt, 3);
- set_priority(xdyn, 4);
- set_priority(t0, 5);
- HostTensorND host_y1, host_t1;
-
- size_t alloc_size = 0;
- auto alloc_size_hdl =
- graph->event().register_receiver<cg::event::StaticMemAlloc>(
- [&](const cg::event::StaticMemAlloc& s) {
- if (s.comp_node.valid()) {
- alloc_size = s.alloc_size;
- }
- });
-
- graph->options().graph_opt_level = 0;
- graph->options().enable_sublinear_memory_opt = true;
- auto func = graph->compile({make_callback_copy(y1, host_y1),
- {po, {}},
- make_callback_copy(t1, host_t1)});
- func->execute().to_json()->writeto_fpath(
- output_file("TestSublinearMemory.DynamicShape.json"));
- ASSERT_GT(alloc_size, 0u);
- ASSERT_LT(alloc_size, NS * 2 + NS / 2);
-
- auto px = host_x->ptr<float>(), py = host_y1.ptr<float>();
- for (size_t i = 0; i < N; ++i) {
- MGB_ASSERT_FLOAT_EQ(px[i] + 1.4f, py[i]);
- }
- host_t->resize({N + 2});
- MGB_ASSERT_TENSOR_EQ(*host_t, host_t1);
- }
-
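- // A trivial graph with nothing to recompute must still compile and run with
- // sublinear mode enabled.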
- TEST(TestSublinearMemory, EmptyGraph) {
- HostTensorGenerator<> gen;
- auto graph = ComputingGraph::make();
- graph->options().enable_sublinear_memory_opt = true;
- auto x = opr::SharedDeviceTensor::make(*graph, *gen({1}));
- auto func = graph->compile({{x, {}}});
- func->execute();
- }
-
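- // y4 = AddUpdate(x4, ...) must keep its dependency on y0 (a reader of x4)
- // in its dep map, whether sublinear mode is off or on.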
- TEST(TestSublinearMemory, DepsInTopoSort) {
- HostTensorGenerator<> gen;
- auto graph = ComputingGraph::make();
- constexpr size_t N = 1024;
- auto host_x0 = gen({N}), host_x1 = gen({N}), host_x2 = gen({N}),
- host_x3 = gen({N});
- auto x0 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x0),
- x1 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x1),
- x2 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x2),
- x3 = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x3),
- x4 = opr::SharedDeviceTensor::make(*graph, *host_x0), y0 = x3 + x4,
- y1 = y0 + x2, y2 = y1 + x1, y3 = y2 + x0,
- y4 = opr::AddUpdate::make(x4, y3);
- SymbolVar vars[] = {x0, x1, x2, x3, x4, y0, y1, y2, y3, y4};
- ComputingGraph::OutputSpec out_spec;
- for (size_t i = 0; i < sizeof(vars) / sizeof(vars[0]); ++i) {
- set_priority(vars[i], i);
- out_spec.push_back({vars[i], {}});
- }
- graph->options().graph_opt_level = 0;
- for (bool enable_sublinear : {false, true}) {
- graph->options().enable_sublinear_memory_opt = enable_sublinear;
- auto func = graph->compile(out_spec);
- ASSERT_EQ(1u, y4.node()->owner_opr()->node_prop().dep_map().count(
- y0.node()));
- }
- }
-
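- // Operators flagged NO_AUTOMATIC_DUP must never be recomputed: with
- // bad=true the compiled sequence contains exactly the two SublinearBadOpr
- // instances written below, while with bad=false a third (recomputed) copy
- // appears, and the predicted memory bottleneck changes accordingly.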
- TEST(TestSublinearMemory, BadOpr) {
- HostTensorGenerator<> gen;
- auto cn = CompNode::load("xpu0");
- constexpr size_t N = 1024, Scale = 2;
- auto host_x = gen({N}, cn);
- for (bool bad : {false, true}) {
- auto graph = ComputingGraph::make();
- auto x = opr::Host2DeviceCopy::make_no_fwd(*graph, host_x),
- bad_var = SublinearBadOpr::make(x, bad, Scale),
- y0 = opr::reduce_sum(bad_var, x.make_scalar_dt(1)),
- y1 = SublinearBadOpr::make(y0, false, N * Scale),
- y = y1 + 1,
- z = opr::reduce_max(bad_var, x.make_scalar_dt(1));
- set_priority(y0, 0);
- set_priority(y1, 1);
- set_priority(y, 2);
- set_priority(z, 3);
- graph->options().graph_opt_level = 0;
- graph->options().enable_sublinear_memory_opt = 1;
-         graph->options().sublinear_mem_config.genetic_nr_iter = 50;
- auto func = graph->compile({{y, {}}, {z, {}}});
- auto&& results = static_cast<cg::ComputingGraphImpl*>(graph.get())
- ->seq_modifier_for_sublinear_memory().prev_min_bottleneck();
-         // bottleneck:
-         // if bad : in y = y1 + 1, bad_var must be kept alive so that z can
-         //          be computed later; total memory usage is
-         //          N * Scale * 2 (bad_var and y1) + 1 (immutable tensor 1)
-         // else   : bad_var = BadOpr(x) can simply be recomputed when
-         //          z = reduce(bad_var) is calculated; total memory usage is
-         //          N (x) + N * Scale (bad_var)
- size_t expect = bad ? N * Scale * 2 + 1 : N * Scale + N;
- ASSERT_EQ(results.at(cn), expect * host_x->dtype().size());
- size_t nr_bad_opr = 0;
- auto count_up = [&nr_bad_opr](cg::OperatorNodeBase* op) {
- if (op->dyn_typeinfo() == SublinearBadOpr::typeinfo()) {
-                 ++nr_bad_opr;
- }
- return true;
- };
- func->iter_opr_seq(count_up);
- ASSERT_EQ(nr_bad_opr, bad ? 2 : 3);
- }
- }
-
- #else
- #pragma message "tests are disabled because sublinear memory optimization is not enabled."
- #endif // MGB_ENABLE_SUBLINEAR
-
- // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}