From d90cb7763c49385838a62e274f8b3b5cf3fe8e48 Mon Sep 17 00:00:00 2001
From: Megvii Engine Team <megengine@megvii.com>
Date: Wed, 24 Nov 2021 14:15:47 +0800
Subject: [PATCH] feat(src/core): record support change ptr basic

GitOrigin-RevId: b12f1c4a6655aeb8aeb28d660a34d0b6d84b37cb
---
 dnn/include/megdnn/basic_types.h            |  75 ++++++++++-
 dnn/src/common/basic_types.cpp              |  10 ++
 imperative/src/impl/ops/tensor_manip.cpp    |   4 +-
 imperative/src/impl/tensor_sanity_check.cpp |   2 +-
 lite/src/mge/network_impl.cpp               |  26 +++-
 lite/src/mge/tensor_impl.cpp                |  23 ++--
 lite/src/mge/tensor_impl.h                  |   1 +
 lite/test/test_network.cpp                  |  64 ++++++++-
 src/core/impl/graph/cg_impl_seq.cpp         |   7 +-
 src/core/impl/graph/var_node_mem_mgr.cpp    |   2 +-
 src/core/impl/tensor.cpp                    |  34 ++++-
 src/core/include/megbrain/tensor.h          |  24 +++-
 src/core/test/comp_node.cpp                 |  75 +++++++++++
 src/gopt/test/no_memory_copy.cpp            | 199 +++++++++++++++++++++++++++-
 src/jit/impl/nvrtc/compiler_cuda.cpp        |   6 +-
 src/opr/impl/basic_arith.cpp                |  16 ++-
 src/opr/impl/cond.cpp                       |   2 +-
 src/opr/impl/misc.cpp                       |   4 +-
 src/opr/impl/tensor_manip.cpp               |   2 +-
 src/opr/test/dnn/convolution.cpp            |   4 +-
 src/plugin/impl/var_sanity_check.cpp        |   2 +-
 21 files changed, 533 insertions(+), 49 deletions(-)

diff --git a/dnn/include/megdnn/basic_types.h b/dnn/include/megdnn/basic_types.h
index 44831f6d..673ed7af 100644
--- a/dnn/include/megdnn/basic_types.h
+++ b/dnn/include/megdnn/basic_types.h
@@ -15,6 +15,8 @@
 #include "megdnn/dtype.h"
 #include "megdnn/internal/defs.h"
 
+#include <memory>
+
 #if MEGDNN_CC_HOST
 #include <cstdarg>
 #include <string>
@@ -402,31 +404,94 @@ struct TensorLayout : public TensorShape {
     MGE_WIN_DECLSPEC_FUC size_t access_bytes() const;
 };
 
+class RefPtr {
+    std::shared_ptr<void*> m_ref;
+    size_t m_offset;
+    bool m_mutable;
+
+public:
+    RefPtr() {
+        m_ref = std::make_shared<void*>((void*)nullptr);
+        m_offset = 0;
+        m_mutable = true;
+    }
+
+    RefPtr(void* ref_ptr, const size_t offset = 0) {
+        m_ref = std::make_shared<void*>(ref_ptr);
+        m_offset = offset;
+        m_mutable = true;
+    }
+
+    explicit RefPtr(
+            std::shared_ptr<void*> ref_ptr, const size_t offset = 0,
+            bool is_mutable = true) {
+        m_ref = ref_ptr;
+        m_offset = offset;
+        m_mutable = is_mutable;
+    }
+
+    void* get_ptr() const {
+        return static_cast<void*>(
+                (*m_ref != NULL) ? static_cast<dt_byte*>(*m_ref) + m_offset
+                                 : nullptr);
+    }
+
+    bool is_mutable() const { return m_mutable; }
+
+    void reset(const void* ptr, size_t offset = 0);
+
+    RefPtr& operator+=(size_t offset) {
+        m_offset += offset;
+        return *this;
+    }
+
+    bool operator==(const RefPtr& other) const {
+        return *m_ref == *other.m_ref && m_offset == other.m_offset;
+    }
+
+    template <typename T>
+    T* ptr() const {
+        return static_cast<T*>(get_ptr());
+    }
+};
+
 /**
  * \brief A simple encapsulation class for n-dimensional tensor.
  */
 struct TensorND {
-    void* raw_ptr;
     TensorLayout layout;
 
-    TensorND() : raw_ptr(NULL) {}
+    TensorND() : m_ref_ptr(RefPtr((void*)nullptr)) {}
 
     TensorND(void* raw_ptr_, const TensorLayout& layout_)
-            : raw_ptr(raw_ptr_), layout(layout_) {}
+            : layout(layout_), m_ref_ptr(raw_ptr_) {}
+
+    TensorND(const TensorLayout& layout_, const RefPtr& ref_ptr)
+            : layout(layout_), m_ref_ptr(ref_ptr) {}
+
+    MGE_WIN_DECLSPEC_FUC void reset_ptr(void* ptr, size_t offset = 0);
+
+    void* raw_ptr() const { return m_ref_ptr.get_ptr(); }
+
+    const RefPtr get_ref_ptr() const { return m_ref_ptr; }
+
+    RefPtr& get_ref_ptr() { return m_ref_ptr; }
 
     //! get typed pointer; type check is performed
     template <typename T>
     T* ptr() const {
        layout.dtype.assert_is_ctype<T>();
-        return static_cast<T*>(raw_ptr);
+        return static_cast<T*>(m_ref_ptr.get_ptr());
    }

    //! get typed pointer of compatible type
    template <typename T>
    T* compatible_ptr() const {
        layout.dtype.assert_is_compatible_ctype<T>();
-        return reinterpret_cast<T*>(raw_ptr);
+        return reinterpret_cast<T*>(m_ref_ptr.get_ptr());
    }
+
+private:
+    RefPtr m_ref_ptr;
 };
 
 #if MEGDNN_CC_HOST
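The heart of this change is the extra indirection in RefPtr: every copy of a TensorND shares the same std::shared_ptr<void*> slot, so a single reset_ptr() retargets all views while their layouts stay untouched. A minimal sketch of that behavior against the patched header (this sketch is not part of the patch itself):

    // Sketch only: demonstrates the shared-slot semantics of RefPtr.
    #include "megdnn/basic_types.h"

    #include <cassert>
    #include <vector>

    int main() {
        std::vector<float> buf_a(16, 1.f), buf_b(16, 2.f);
        megdnn::TensorND t(
                buf_a.data(), megdnn::TensorLayout{{4, 4}, megdnn::dtype::Float32()});

        // Copying the RefPtr copies the shared_ptr<void*>, not the address:
        // both tensors now read through the same slot.
        megdnn::TensorND view(t.layout, t.get_ref_ptr());
        assert(view.raw_ptr() == buf_a.data());

        // reset_ptr() writes through the slot, so the view follows along;
        // this is what lets a recorded kernel sequence see new buffers.
        t.reset_ptr(buf_b.data());
        assert(view.raw_ptr() == buf_b.data());
    }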
diff --git a/dnn/src/common/basic_types.cpp b/dnn/src/common/basic_types.cpp
index 9f3dca61..ba92ccb5 100644
--- a/dnn/src/common/basic_types.cpp
+++ b/dnn/src/common/basic_types.cpp
@@ -605,4 +605,14 @@ std::string TensorLayout::serialize() const {
     return rst;
 }
 
+void RefPtr::reset(const void* ptr, size_t offset) {
+    megdnn_assert(m_mutable, "this RefPtr can't change.");
+    *m_ref = const_cast<void*>(ptr);
+    m_offset = offset;
+}
+
+void TensorND::reset_ptr(void* ptr, size_t offset) {
+    m_ref_ptr.reset(ptr, offset);
+}
+
 // vim: syntax=cpp.doxygen
diff --git a/imperative/src/impl/ops/tensor_manip.cpp b/imperative/src/impl/ops/tensor_manip.cpp
index cd249023..ff4e5b07 100644
--- a/imperative/src/impl/ops/tensor_manip.cpp
+++ b/imperative/src/impl/ops/tensor_manip.cpp
@@ -342,7 +342,7 @@ void param_pack_concat_execute(
             [comp_node](dt_byte* ptr) { comp_node.free_host(ptr); }};
     TensorLayout srcs_layout = TensorLayout{{nr_inputs}, dtype::Int32()};
     for (size_t i = 0; i < nr_inputs; ++i) {
-        srcs_raw_ptr[i] = inputs[i]->dev_tensor().as_megdnn().raw_ptr;
+        srcs_raw_ptr[i] = inputs[i]->dev_tensor().as_megdnn().raw_ptr();
     }
     HostTensorStorage srcs_storage;
     srcs_storage.reset(comp_node, srcs_size, srcs_ptr);
@@ -392,7 +392,7 @@ SmallVector<TensorPtr> param_pack_concat_apply_on_physical_tensor(
                 src_shapes, inputs.back()->shape(), TensorShape{});
     }
     for (size_t i = 0; i < nr_inputs; ++i) {
-        srcs_raw_ptr[i] = inputs[i]->dev_tensor().as_megdnn().raw_ptr;
+        srcs_raw_ptr[i] = inputs[i]->dev_tensor().as_megdnn().raw_ptr();
     }
     HostTensorStorage srcs_storage;
     srcs_storage.reset(comp_node, srcs_size, srcs_ptr);
diff --git a/imperative/src/impl/tensor_sanity_check.cpp b/imperative/src/impl/tensor_sanity_check.cpp
index ca4b46de..b62201d7 100644
--- a/imperative/src/impl/tensor_sanity_check.cpp
+++ b/imperative/src/impl/tensor_sanity_check.cpp
@@ -26,7 +26,7 @@ TensorChecksumCalc::ChecksumResult TensorChecksumCalc::calc(TensorPtr ptr) {
     auto span = dt.layout().span();
 
     megdnn::TensorND tensor;
-    tensor.raw_ptr = dt.raw_ptr() + span.low_byte;
+    tensor.reset_ptr(dt.raw_ptr() + span.low_byte);
     tensor.layout.init_contiguous_stride({span.dist_byte()});
     tensor.layout.dtype = dtype::Byte();
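RefPtr::reset swaps the base pointer and the byte offset together, guarded by m_mutable, and get_ptr() always recomputes base plus offset. A small sketch of the offset arithmetic (again, not part of the patch):

    #include "megdnn/basic_types.h"

    #include <cassert>

    int main() {
        unsigned char buf[64];
        megdnn::RefPtr ref(buf);  // constructed from a raw pointer: mutable
        ref.reset(buf, 16);       // base = buf, offset = 16 bytes
        assert(ref.get_ptr() == static_cast<void*>(buf + 16));

        ref += 8;                 // offsets accumulate
        assert(ref.get_ptr() == static_cast<void*>(buf + 24));

        // A RefPtr built with is_mutable = false (as as_megdnn() does later
        // in this patch) would instead trip the megdnn_assert in reset().
    }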
diff --git a/lite/src/mge/network_impl.cpp b/lite/src/mge/network_impl.cpp
index ae106921..109b3f6d 100644
--- a/lite/src/mge/network_impl.cpp
+++ b/lite/src/mge/network_impl.cpp
@@ -527,6 +527,10 @@ void NetworkImplDft::update_input() {
             config_in.lite_tensor->set_layout(
                     to_lite_layout(in_tensor_iter.second->layout()));
         }
+        TensorHelper::implement(config_in.lite_tensor)
+                ->cast_final_safe<TensorImplDft>()
+                .m_record_reset =
+                m_user_config->options.comp_node_seq_record_level > 0;
         if (config_in.config_layout.ndim &&
             !(config_in.config_layout == config_in.lite_tensor->get_layout())) {
             config_in.lite_tensor->set_layout(config_in.config_layout);
@@ -541,6 +545,10 @@ void NetworkImplDft::update_input() {
             TensorHelper::implement(io_in.lite_tensor)
                     ->cast_final_safe<TensorImplDft>()
                     .m_host_tensor = in_tensor_iter.second;
+            TensorHelper::implement(io_in.lite_tensor)
+                    ->cast_final_safe<TensorImplDft>()
+                    .m_record_reset =
+                    m_user_config->options.comp_node_seq_record_level > 0;
             io_in.lite_tensor->update_from_implement();
             m_network_io->inputs.push_back(io_in);
         }
@@ -603,6 +611,10 @@ void NetworkImplDft::update_output() {
             }
             try_infer_tensor_layout(out_it->lite_tensor, var);
             output_tensor_copy_optimize(var, out_it->lite_tensor);
+            TensorHelper::implement(out_it->lite_tensor)
+                    ->cast_final_safe<TensorImplDft>()
+                    .m_record_reset =
+                    m_user_config->options.comp_node_seq_record_level > 0;
         }
         //! user not set, use default output
     } else {
@@ -631,6 +643,10 @@ void NetworkImplDft::update_output() {
                 lite_tensor = output.lite_tensor;
             }
             output_tensor_copy_optimize(out, lite_tensor);
+            TensorHelper::implement(lite_tensor)
+                    ->cast_final_safe<TensorImplDft>()
+                    .m_record_reset =
+                    m_user_config->options.comp_node_seq_record_level > 0;
         }
     }
 }
@@ -643,14 +659,20 @@ void NetworkImplDft::output_tensor_copy_optimize(
             "Can't set force_output_use_user_specified_memory and "
             "force_output_dynamic_alloc at the same time.");
     if (m_user_config->options.force_output_use_user_specified_memory) {
+        bool in_record = m_user_config->options.comp_node_seq_record_level > 0;
         TensorHelper::implement(tensor)
                 ->cast_final_safe<TensorImplDft>()
-                .set_reset_callback([var](TensorImplDft* dft_tensor) {
+                .set_reset_callback([var, in_record](TensorImplDft* dft_tensor) {
                     dft_tensor->device_share_host_memory();
                     auto dv = dft_tensor->dev_tensor().get();
                     dv->comp_node(var.node()->comp_node(), true);
                     var.node()->init_mem_plan(dv);
-                    var.node()->reset_dev_tensor_from_tensor(*dv);
+                    if (in_record) {
+                        auto&& device_tensor = var.node()->mutable_dev_tensor();
+                        device_tensor.only_reset_raw_storage(dv->storage());
+                    } else {
+                        var.node()->reset_dev_tensor_from_tensor(*dv);
+                    }
                 });
     }
     if (m_user_config->options.force_output_dynamic_alloc) {
diff --git a/lite/src/mge/tensor_impl.cpp b/lite/src/mge/tensor_impl.cpp
index 63d38ec7..8d502d6f 100644
--- a/lite/src/mge/tensor_impl.cpp
+++ b/lite/src/mge/tensor_impl.cpp
@@ -314,14 +314,22 @@ void TensorImplDft::reset(void* prepared_data) {
         size_t size = mge_layout.span().dist_byte();
         mgb::HostTensorStorage storage;
         storage.reset(cn, size, raw_storage);
-        m_host_tensor->reset(storage, mge_layout);
+        if (m_record_reset) {
+            m_host_tensor->only_reset_raw_storage(storage);
+        } else {
+            m_host_tensor->reset(storage, mge_layout);
+        }
     } else {
         auto cn = m_dev_tensor->comp_node();
         auto mge_layout = m_dev_tensor->layout();
         size_t size = mge_layout.span().dist_byte();
         mgb::DeviceTensorStorage storage;
         storage.reset(cn, size, raw_storage);
-        m_dev_tensor->reset(storage, mge_layout);
+        if (m_record_reset) {
+            m_dev_tensor->only_reset_raw_storage(storage);
+        } else {
+            m_dev_tensor->reset(storage, mge_layout);
+        }
     }
     if (m_reset_callback) {
         m_reset_callback(this);
@@ -455,14 +463,9 @@ void TensorImplDft::device_share_host_memory() {
                 m_host_tensor->comp_node(), m_host_tensor->layout());
     }
     if (m_host_tensor->raw_ptr() != m_dev_tensor->raw_ptr()) {
-        auto raw_storage = std::shared_ptr<mgb::dt_byte>(
-                m_host_tensor->raw_ptr(), [](void*) {});
-        auto cn = m_host_tensor->comp_node();
-        auto mge_layout = m_host_tensor->layout();
-        size_t size = mge_layout.span().dist_byte();
-        mgb::DeviceTensorStorage storage;
-        storage.reset(cn, size, raw_storage);
-        m_dev_tensor->reset(storage, mge_layout);
+        auto&& storage =
+                mgb::DeviceTensorStorage::make_proxy(m_host_tensor->storage());
+        m_dev_tensor->only_reset_raw_storage(storage);
     }
 }
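The reset callback above is what lets user-owned output memory coexist with a recorded kernel sequence: under recording, the output var's storage is swapped in place rather than the dev tensor being re-created. From the user's side this is just Tensor::reset on the output tensor. A sketch of that half of the contract, as a hypothetical helper (buffer shape borrowed from the tests later in this patch):

    #include "lite/network.h"
    #include "lite/tensor.h"

    #include <memory>

    // Hypothetical helper: hand a user-owned buffer to the graph output of a
    // network loaded with force_output_use_user_specified_memory = true.
    void bind_user_output(std::shared_ptr<lite::Network> network) {
        auto output = network->get_output_tensor(0);
        // User-owned buffer matching the (frozen) output layout {1, 1000}.
        auto user_out = std::make_shared<lite::Tensor>(
                LiteDeviceType::LITE_CPU,
                lite::Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT});
        // With record enabled, the reset callback swaps the var node storage
        // via only_reset_raw_storage; no device-to-device copy happens.
        output->reset(user_out->get_memory_ptr(), output->get_layout());
    }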
diff --git a/lite/src/mge/tensor_impl.h b/lite/src/mge/tensor_impl.h
index a33e2bf7..06781a3f 100644
--- a/lite/src/mge/tensor_impl.h
+++ b/lite/src/mge/tensor_impl.h
@@ -126,6 +126,7 @@ private:
     void set_mge_tensor_compnode(const mgb::CompNode& comp_node);
 
 private:
+    bool m_record_reset = false;
     std::function<void(TensorImplDft*)> m_get_memory_callback;
     std::function<void(TensorImplDft*)> m_reset_callback;
     std::shared_ptr<mgb::HostTensorND> m_host_tensor;
diff --git a/lite/test/test_network.cpp b/lite/test/test_network.cpp
index 8418daf5..5763e0b9 100644
--- a/lite/test/test_network.cpp
+++ b/lite/test/test_network.cpp
@@ -412,9 +412,12 @@ TEST(TestNetWork, ResetOutput) {
     compare_lite_tensor<float>(output_tensor, result_mgb);
 }
 
-TEST(TestNetWork, OutputNoCopy) {
+namespace {
+
+void test_output_no_copy(int record) {
     Config config;
     config.options.force_output_use_user_specified_memory = true;
+    config.options.comp_node_seq_record_level = record;
     auto tensor = get_input_data("./input_data.npy");
     std::string model_path = "./shufflenet.mge";
     std::string input_name = "data";
@@ -453,6 +456,65 @@ TEST(TestNetWork, OutputNoCopy) {
     }
 }
 
+void test_input_no_copy(int record) {
+    Config config;
+    config.options.force_output_use_user_specified_memory = true;
+    config.options.comp_node_seq_record_level = record;
+    std::string model_path = "./shufflenet.mge";
+    std::string input_name = "data";
+
+    Layout layout_in{{1, 3, 224, 224}, 4};
+    std::vector<std::shared_ptr<Tensor>> inputs;
+    std::vector<std::shared_ptr<Tensor>> outputs;
+    for (int i = 0; i < 3; i++) {
+        auto tmp_in = std::make_shared<Tensor>(LiteDeviceType::LITE_CPU, layout_in);
+
+        auto ptr = static_cast<float*>(tmp_in->get_memory_ptr());
+        for (size_t id = 0; id < 2 * 224 * 224; id++) {
+            ptr[id] = i + 1;
+        }
+        inputs.push_back(tmp_in);
+        outputs.push_back(mgb_lar(model_path, config, input_name, tmp_in));
+    }
+
+    std::shared_ptr<Network> network = std::make_shared<Network>(config);
+
+    network->load_model(model_path);
+    std::shared_ptr<Tensor> input_tensor = network->get_io_tensor(input_name);
+    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
+
+    for (int i = 0; i < 3; i++) {
+        auto ptr = inputs[i]->get_memory_ptr();
+        input_tensor->reset(ptr, layout_in);
+
+        auto tmp_out = std::make_shared<Tensor>(
+                LiteDeviceType::LITE_CPU,
+                Layout{{1, 1000}, 2, LiteDataType::LITE_FLOAT});
+        output_tensor->reset(tmp_out->get_memory_ptr(), output_tensor->get_layout());
+
+        network->forward();
+        network->wait();
+        compare_lite_tensor<float>(output_tensor, outputs[i]);
+    }
+}
+}  // namespace
+
+TEST(TestNetWork, OutputNoCopy) {
+    test_output_no_copy(0);
+}
+
+TEST(TestNetWork, OutputNoCopyRecord) {
+    test_output_no_copy(1);
+}
+
+TEST(TestNetWork, IONoCopy) {
+    test_input_no_copy(0);
+}
+
+TEST(TestNetWork, IONoCopyRecord) {
+    test_input_no_copy(1);
+}
+
 TEST(TestNetWork, OutputDynamicAlloc) {
     Config config;
     config.options.force_output_dynamic_alloc = true;
diff --git a/src/core/impl/graph/cg_impl_seq.cpp b/src/core/impl/graph/cg_impl_seq.cpp
index 8bbf755d..b146969b 100644
--- a/src/core/impl/graph/cg_impl_seq.cpp
+++ b/src/core/impl/graph/cg_impl_seq.cpp
@@ -250,9 +250,14 @@ std::unique_ptr<CompNodeSeqRecorder> ComputingGraphImpl::ComputingSequence::
                 "graph.");
         return {};
     }
+    auto is_graph_dest_varnode = [&](VarNode* var) {
+        return ComputingGraphImpl::downcast(owner_graph())->var_receiver(var).size() ==
+               0;
+    };
+
     for (auto i : *m_opr_seq) {
         for (auto j : i->output()) {
-            if (!is_static_var_storage(j)) {
+            if (!is_static_var_storage(j) && !is_graph_dest_varnode(j)) {
                 mgb_log_error(
                         "can not enable CompNodeSeqRecorder because var "
                         "storage not static: %s",
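The input half of the no-copy loop exercised by test_input_no_copy above is the pattern end users would write: re-point the I/O tensor at a new buffer between inferences. Distilled into a hypothetical helper ("data" is the input name assumed by the tests; nothing here is new API beyond the patch):

    #include "lite/network.h"
    #include "lite/tensor.h"

    #include <memory>

    // Rebind input memory between two inferences of a network loaded with
    // options.comp_node_seq_record_level = 1 (hypothetical helper).
    void run_with_new_input(std::shared_ptr<lite::Network> network, void* buffer) {
        auto input = network->get_io_tensor("data");
        // In record mode the layout was frozen when the sequence was
        // recorded, so the replacement buffer must match it exactly; the
        // implementation then swaps only the raw storage underneath.
        input->reset(buffer, input->get_layout());
        network->forward();
        network->wait();
    }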
diff --git a/src/core/impl/graph/var_node_mem_mgr.cpp b/src/core/impl/graph/var_node_mem_mgr.cpp
index 76b1626f..2af9e1e6 100644
--- a/src/core/impl/graph/var_node_mem_mgr.cpp
+++ b/src/core/impl/graph/var_node_mem_mgr.cpp
@@ -319,7 +319,7 @@ bool VarNodeMemManager::DynamicAllocOprInfo::check_if_mem_status_change() {
         for (size_t i = 0; i < dev_val_input.size(); i++) {
             auto&& t = prev_dev_val_input[i];
             auto s = dev_val_input[i]->dev_tensor().as_megdnn();
-            if (t.raw_ptr != s.raw_ptr || !t.layout.eq_layout(s.layout)) {
+            if (t.raw_ptr() != s.raw_ptr() || !t.layout.eq_layout(s.layout)) {
                 same = false;
                 t = s;
             }
diff --git a/src/core/impl/tensor.cpp b/src/core/impl/tensor.cpp
index 3a102b77..a58bb84e 100644
--- a/src/core/impl/tensor.cpp
+++ b/src/core/impl/tensor.cpp
@@ -233,6 +233,7 @@ TensorStorage<Trait>& TensorStorage<Trait>::operator=(const TensorStorage& rhs)
     m_capacity = rhs.m_capacity;
     m_offset = rhs.m_offset;
     m_data = rhs.m_data;
+    m_ref_ptr = rhs.m_ref_ptr;
     return *this;
 }
 
@@ -264,7 +265,8 @@ TensorStorage<Trait> TensorStorage<Trait>::sub(ptrdiff_t offset) const {
             m_size - offset,
             m_capacity - offset,
             static_cast<size_t>(toff),
-            m_data};
+            m_data,
+            m_ref_ptr};
 }
 
 template <class Trait>
@@ -278,8 +280,10 @@ dt_byte* TensorStorage<Trait>::apply_lazy_and_get_ptr() {
         mgb_throw_if(!ptr, SystemError, "failed to allocate memory");
         CompNode cn = m_comp_node;
         m_data.reset(ptr, [cn](void* p) { Trait::free(cn, p); });
+        m_ref_ptr = std::make_shared<void*>(static_cast<void*>(nullptr));
         m_capacity = m_size;
     }
+    *m_ref_ptr = static_cast<void*>(m_data.get());
     return m_data.get() + m_offset;
 }
 
@@ -305,6 +309,19 @@ void TensorStorage<Trait>::reset(CompNode node, size_t size, RawStorage data) {
     m_capacity = size;
     m_offset = 0;
     m_data = std::move(data);
+    m_ref_ptr = std::make_shared<void*>(static_cast<void*>(m_data.get()));
+}
+
+template <class Trait>
+void TensorStorage<Trait>::only_reset_raw_storage(
+        CompNode node, size_t size, RawStorage data, size_t offset) {
+    mgb_assert(m_allow_realloc);
+    m_comp_node = node;
+    m_size = size;
+    m_capacity = size;
+    m_offset = offset;
+    m_data = std::move(data);
+    *m_ref_ptr = static_cast<void*>(m_data.get());
 }
 
 template <class Trait>
@@ -316,8 +333,8 @@ TensorStorage<Trait> TensorStorage<Trait>::make_proxy(
             "proxy source should be on CPU; got %s",
             src.comp_node().to_string().c_str());
     src.ptr();
-    return {true, src.m_comp_node, src.m_size,
-            src.m_capacity, src.m_offset, src.m_data};
+    return {true,         src.m_comp_node, src.m_size, src.m_capacity,
+            src.m_offset, src.m_data,      src.m_ref_ptr};
 }
 
 template <class Trait>
@@ -481,6 +498,17 @@ DEF(reset, &)(TensorStorage storage, const TensorLayout& layout) {
     return static_cast<ChainReturnType&>(*this);
 }
 
+DEF(only_reset_raw_storage, &)(TensorStorage storage) {
+    //! the storage to be reset must either satisfy the layout or be empty;
+    //! empty storage is used after weight preprocessing to save memory while
+    //! still allowing layout checks when running
+    mgb_assert(storage.valid_span(m_layout.span()) || storage.empty());
+    m_storage.only_reset_raw_storage(
+            storage.comp_node(), storage.size(), storage.raw_storage(),
+            storage.offset());
+    return static_cast<ChainReturnType&>(*this);
+}
+
 DEF(comp_node, &)(CompNode comp_node, bool allow_mem_node_change) {
     auto orig_cn = m_storage.comp_node_allow_invalid();
     m_storage.comp_node(comp_node, allow_mem_node_change);
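The invariant only_reset_raw_storage maintains is that the RefPtr slot survives the swap: megdnn views created earlier through as_megdnn() resolve to the new buffer, whereas reset() installs a fresh slot together with a new layout. A compile-level sketch of the difference (not part of the patch):

    #include "megbrain/tensor.h"

    using namespace mgb;

    void storage_swap_sketch() {
        auto cn = CompNode::load("cpu0");
        HostTensorND a{cn, TensorShape{16}, dtype::Float32()};
        HostTensorND b{cn, TensorShape{16}, dtype::Float32()};
        b.raw_ptr();  // materialize b's buffer up front

        // The view shares a's RefPtr slot from this point on.
        megdnn::TensorND view = a.as_megdnn();

        // Swap in b's buffer; a's layout is untouched and the earlier view
        // now resolves to the new address through the shared slot.
        a.only_reset_raw_storage(b.storage());
        mgb_assert(view.raw_ptr() == a.raw_ptr());
    }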
diff --git a/src/core/include/megbrain/tensor.h b/src/core/include/megbrain/tensor.h
index e268f3d4..d60e3f2a 100644
--- a/src/core/include/megbrain/tensor.h
+++ b/src/core/include/megbrain/tensor.h
@@ -226,6 +226,12 @@ public:
     MGE_WIN_DECLSPEC_FUC void reset(CompNode node, size_t size, RawStorage data);
 
     /*!
+     * \brief reset the tensor storage to given memory area
+     */
+    MGE_WIN_DECLSPEC_FUC void only_reset_raw_storage(
+            CompNode node, size_t size, RawStorage data, size_t offset);
+
+    /*!
      * \brief make a TensorStorage that shares memory with another
     *      TensorStorage some different storage type
     *
@@ -270,6 +276,11 @@ public:
         return m_data;
     }
 
+    std::shared_ptr<void*> get_ref_ptr() const {
+        ptr();
+        return m_ref_ptr;
+    }
+
 private:
     template <class T>
     friend class TensorStorage;
@@ -289,16 +300,20 @@ private:
 
     RawStorage m_data;
 
+    std::shared_ptr<void*> m_ref_ptr = std::make_shared<void*>((void*)nullptr);
+
     //! used internally for returning a predefined TensorStorage
     TensorStorage(
             bool allow_realloc, CompNode comp_node, size_t size, size_t capacity,
-            size_t offset, const RawStorage& data)
+            size_t offset, const RawStorage& data,
+            std::shared_ptr<void*> ref_ptr = std::make_shared<void*>((void*)nullptr))
             : m_allow_realloc(allow_realloc),
               m_comp_node(comp_node),
               m_size(size),
               m_capacity(capacity),
               m_offset(offset),
-              m_data(data) {}
+              m_data(data),
+              m_ref_ptr(ref_ptr) {}
 
     void check_comp_node_valid() const {
         if (mgb_unlikely(!m_comp_node.valid()))
@@ -423,6 +438,8 @@ public:
     MGE_WIN_DECLSPEC_FUC ChainReturnType& reset(
             TensorStorage storage, const TensorLayout& layout);
 
+    MGE_WIN_DECLSPEC_FUC ChainReturnType& only_reset_raw_storage(TensorStorage storage);
+
     /* ================= getter and setters ================= */
 
     /*!
@@ -501,7 +518,8 @@ public:
     //! convert to megdnn::TensorND
     megdnn::TensorND as_megdnn() const {
-        return {const_cast<void*>(static_cast<const void*>(raw_ptr())), m_layout};
+        megdnn::RefPtr ref_ptr(m_storage.get_ref_ptr(), m_storage.offset(), false);
+        return {m_layout, ref_ptr};
     }
 
     /* ================= misc ================= */
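Note the trailing `false` in the new as_megdnn(): the megdnn view follows pointer changes made through the owning tensor, but is itself read-only with respect to retargeting. A sketch (not part of the patch):

    #include "megbrain/tensor.h"

    using namespace mgb;

    void immutable_view_sketch() {
        auto cn = CompNode::load("cpu0");
        DeviceTensorND t{cn, TensorShape{8}, dtype::Float32()};

        megdnn::TensorND view = t.as_megdnn();
        // as_megdnn() builds its RefPtr with is_mutable = false, so calling
        // reset() through the view's ref would hit the megdnn_assert in
        // RefPtr::reset; the graph side remains the only writer.
        mgb_assert(!view.get_ref_ptr().is_mutable());
    }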
diff --git a/src/core/test/comp_node.cpp b/src/core/test/comp_node.cpp
index 4164d9f3..1a366cb2 100644
--- a/src/core/test/comp_node.cpp
+++ b/src/core/test/comp_node.cpp
@@ -816,4 +816,79 @@ TYPED_TEST(TestCPUCompSeqRec, run_multi_thread_default) {
     }
 }
 }  // anonymous namespace
+
+#include "megbrain/opr/basic_arith_wrapper.h"
+#include "megbrain/opr/io.h"
+#include "megbrain/opr/tensor_manip.h"
+#include "megbrain/opr/utility.h"
+
+TEST(TestCPUCompSeqRec, run_dyn_ptr) {
+    CompNode cn = CompNode::load("cpux");
+
+    HostTensorGenerator<> gen;
+    auto host_x0 = gen({4, 1}, cn), host_y0 = gen({4, 1}, cn),
+         host_z0 = gen({4, 1}, cn);
+    auto host_x1 = gen({4, 1}, cn), host_y1 = gen({4, 1}, cn),
+         host_z1 = gen({4, 1}, cn);
+
+    auto dev_x0 = std::make_shared<DeviceTensorND>(cn);
+    auto dev_y0 = std::make_shared<DeviceTensorND>(cn);
+    auto dev_z0 = std::make_shared<DeviceTensorND>(cn);
+    auto dev_x1 = std::make_shared<DeviceTensorND>(cn);
+    auto dev_y1 = std::make_shared<DeviceTensorND>(cn);
+    auto dev_z1 = std::make_shared<DeviceTensorND>(cn);
+
+    (*dev_x0).comp_node(cn).copy_from(*host_x0).sync();
+    (*dev_y0).comp_node(cn).copy_from(*host_y0).sync();
+    (*dev_z0).comp_node(cn).copy_from(*host_z0).sync();
+    (*dev_x1).comp_node(cn).copy_from(*host_x1).sync();
+    (*dev_y1).comp_node(cn).copy_from(*host_y1).sync();
+    (*dev_z1).comp_node(cn).copy_from(*host_z1).sync();
+
+    auto check = [&]() {
+        HostTensorND ret(CompNode::load("cpux"), host_x0->shape());
+        auto px = host_x0->ptr<float>(), py = host_y0->ptr<float>(),
+             pz = host_z0->ptr<float>(), pw = ret.ptr<float>();
+        auto sz0 = host_x0->shape()[0], sz1 = host_x0->shape()[1];
+
+        for (size_t i = 0; i < sz0; ++i) {
+            for (size_t j = 0; j < sz1; ++j) {
+                pw[i * sz1 + j] = px[i * sz1 + j] * py[i * sz1 + j] + pz[i * sz1 + j];
+            }
+        }
+        return ret;
+    };
+
+    auto graph = ComputingGraph::make();
+    // test record on first run
+    graph->options().var_sanity_check_first_run = false;
+    graph->options().graph_opt_level = 0;
+    graph->options().comp_node_seq_record_level = 1;
+    graph->options().fake_next_exec = true;
+
+    auto x = opr::VolatileSharedDeviceTensor::make(*graph, dev_x0),
+         y = opr::VolatileSharedDeviceTensor::make(*graph, dev_y0),
+         z = opr::VolatileSharedDeviceTensor::make(*graph, dev_z0),
+         w = opr::Elemwise::make({x, y, z}, opr::Elemwise::Mode::FUSE_MUL_ADD3);
+
+    HostTensorND host_w;
+    auto func = graph->compile({{w, [&host_w](DeviceTensorND& d) {
+                                     host_w = mgb::HostTensorND::make_proxy(d);
+                                 }}});
+    func->execute();
+
+    for (int i = 0; i < 4; ++i) {
+        if (i == 2) {
+            *host_x0 = *host_x1;
+            *host_y0 = *host_y1;
+            *host_z0 = *host_z1;
+            dev_x0->only_reset_raw_storage(dev_x1->storage());
+            dev_y0->only_reset_raw_storage(dev_y1->storage());
+            dev_z0->only_reset_raw_storage(dev_z1->storage());
+        }
+        func->execute();
+        auto expect = check();
+        MGB_ASSERT_TENSOR_EQ(expect, host_w) << "iter " << i;
+    }
+}
+
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
diff --git a/src/gopt/test/no_memory_copy.cpp b/src/gopt/test/no_memory_copy.cpp
index 5e19f764..4bd0b53d 100644
--- a/src/gopt/test/no_memory_copy.cpp
+++ b/src/gopt/test/no_memory_copy.cpp
@@ -13,6 +13,7 @@
 #include "./network.h"
 #include "megbrain/comp_node_env.h"
 #include "megbrain/opr/basic_arith.h"
+#include "megbrain/opr/tensor_manip.h"
 #include "megbrain/test/helper.h"
 
 using namespace mgb;
@@ -20,9 +21,11 @@ using namespace mgb;
 struct TestGraph {
     CompNode m_cn;
     HostTensorGenerator<> m_gen;
+    HostTensorGenerator<dtype::Int32> m_gen_int;
     std::unique_ptr<Network> m_network;
     SymbolVar m_out_var;
     std::shared_ptr<HostTensorND> input_tensor;
+    std::shared_ptr<HostTensorND> input_tensor2;
 
     TestGraph() {
         m_cn = CompNode::load("cpu0");
@@ -41,6 +44,78 @@ struct TestGraph {
         m_out_var = m_network->add_pooling(f, {2, 2}, {2, 2});
     }
 
+    void create_graph_with_subtensor_forward() {
+        input_tensor = m_gen({2, 3, 32, 32}, m_cn);
+        auto input = opr::Host2DeviceCopy::make(*m_network->graph, input_tensor, m_cn)
+                             .rename("input");
+
+        auto cv = [&](int v) {
+            auto rst = input.make_scalar(v);
+            return rst;
+        };
+
+        using Ad = opr::Subtensor::AxisIndexer;
+        auto sub =
+                opr::Subtensor::make(input, {Ad::make_interval(0, cv(1), cv(2), None)});
+
+        auto f = m_network->add_conv(
+                sub, 4, {3, 3}, dtype::Float32(), true, {2, 2}, {0, 0});
+        f = m_network->add_elemwise(
+                {f}, dtype::Float32(), opr::Elemwise::Param::Mode::EXP);
+        f = m_network->add_conv(f, 8, {3, 3}, dtype::Float32(), true, {1, 1}, {1, 1});
+        m_out_var = m_network->add_pooling(f, {2, 2}, {2, 2});
+    }
+
+    void create_graph_with_subtensor_relayout() {
+        input_tensor = m_gen({2, 3, 32, 40}, m_cn);
+        auto input = opr::Host2DeviceCopy::make(*m_network->graph, input_tensor, m_cn)
+                             .rename("input");
+
+        auto cv = [&](int v) {
+            auto rst = input.make_scalar(v);
+            return rst;
+        };
+
+        using Ad = opr::Subtensor::AxisIndexer;
+        auto sub = opr::Subtensor::make(
+                input, {Ad::make_interval(0, cv(1), cv(2), None),
+                        Ad::make_interval(3, cv(0), cv(32), None)});
+
+        auto f = m_network->add_conv(
+                sub, 4, {3, 3}, dtype::Float32(), true, {2, 2}, {0, 0});
+        f = m_network->add_elemwise(
+                {f}, dtype::Float32(), opr::Elemwise::Param::Mode::EXP);
+        f = m_network->add_conv(f, 8, {3, 3}, dtype::Float32(), true, {1, 1}, {1, 1});
+        m_out_var = m_network->add_pooling(f, {2, 2}, {2, 2});
+    }
+
+    void create_graph_with_setsubtensor() {
+        input_tensor = m_gen({1, 3, 32, 32}, m_cn);
+        input_tensor2 = m_gen({1, 1, 32, 32}, m_cn);
+        auto input = opr::Host2DeviceCopy::make(*m_network->graph, input_tensor, m_cn)
+                             .rename("input");
+
+        auto input_sub =
+                opr::Host2DeviceCopy::make(*m_network->graph, input_tensor2, m_cn)
+                        .rename("input2");
+
+        auto cv = [&](int v) {
+            auto rst = input.make_scalar(v);
+            return rst;
+        };
+
+        using Ad = opr::Subtensor::AxisIndexer;
+        input = opr::SetSubtensor::make(
+                input, input_sub, {Ad::make_interval(1, cv(1), cv(2), None)});
+
+        auto f = m_network->add_conv(
+                input, 4, {3, 3}, dtype::Float32(), true, {2, 2}, {0, 0});
+        f = m_network->add_elemwise(
+                {f}, dtype::Float32(), opr::Elemwise::Param::Mode::EXP);
+        f = m_network->add_conv(f, 8, {3, 3}, dtype::Float32(), true, {1, 1}, {1, 1});
+        m_out_var = m_network->add_pooling(f, {2, 2}, {2, 2});
+    }
+
     std::unique_ptr<cg::AsyncExecutable> compile_without_copy() {
         return m_network->graph->compile({{m_out_var, nullptr}});
     }
@@ -51,8 +126,11 @@ struct TestGraph {
     }
 };
 
-TEST(TestNoCopy, BasicInputNoCopy) {
+namespace {
+void test_basic_input_no_copy(bool record) {
     auto test_graph = TestGraph();
+    auto compute_graph = test_graph.m_network->graph;
+    compute_graph->options().comp_node_seq_record_level = record;
     test_graph.create_graph();
     HostTensorND out, out_pre;
     auto func = test_graph.compile_with_copy(out);
@@ -68,7 +146,11 @@ TEST(TestNoCopy, BasicInputNoCopy) {
         for (size_t d = 0; d < length; d++) {
             ptr[d] = i;
         }
-        input_tensor->reset(storage, layout);
+        if (record) {
+            input_tensor->only_reset_raw_storage(storage);
+        } else {
+            input_tensor->reset(storage, layout);
+        }
     }
     func->execute();
     func->wait();
@@ -78,6 +160,11 @@ TEST(TestNoCopy, BasicInputNoCopy) {
         out_pre.copy_from(out).sync();
     }
 }
+}  // namespace
+
+TEST(TestNoCopy, InputNoCopyPtrEQ) {
+    test_basic_input_no_copy(0);
+}
 
 TEST(TestNoCopy, IONoCopyPtrEQ) {
     auto test_graph = TestGraph();
@@ -158,8 +245,112 @@ TEST(TestNoCopy, IONoCopyCorrect) {
     }
 }
 
-TEST(TestNoCopy, InputNoCopyRecord) {}
+TEST(TestNoCopy, InputNoCopyRecord) {
+    test_basic_input_no_copy(1);
+}
+
+TEST(TestNoCopy, IONoCopyRecord) {
+    auto test_graph = TestGraph();
+    auto compute_graph = test_graph.m_network->graph;
+    compute_graph->options().force_output_use_user_specified_memory = true;
+    compute_graph->options().comp_node_seq_record_level = 1;
+    test_graph.create_graph();
+    HostTensorND truth;
+    auto func = test_graph.compile_without_copy();
+    auto&& outvar = func->get_output_vars()[0];
+    DeviceTensorND tmp(test_graph.m_cn, {1, 8, 7, 7});
+    outvar->init_mem_plan(&tmp);
+    size_t times = 10;
+    for (size_t i = 0; i < times; i++) {
+        auto input_tensor = test_graph.input_tensor;
+        auto layout = input_tensor->layout();
+        size_t length = layout.total_nr_elems();
+        auto storage = TensorStorage<HostTensorStorageTrait>(test_graph.m_cn);
+        storage.ensure_size(length * sizeof(float));
+        float* ptr = storage.ptr()->as<float>();
+        for (size_t d = 0; d < length; d++) {
+            ptr[d] = i / 5 + 3;
+        }
+        input_tensor->only_reset_raw_storage(storage);
+        DeviceTensorND dv(test_graph.m_cn, {1, 8, 7, 7});
+        dv.raw_ptr();
+
+        auto& dev_tensor = outvar->mutable_dev_tensor();
+        dev_tensor.only_reset_raw_storage(dv.storage());
+
+        func->execute();
+        func->wait();
+        if (i % 5 == 0) {
+            truth.copy_from(dv).sync();
+            continue;
+        }
+        HostTensorND to_check;
+        to_check.copy_from(dv).sync();
+        MGB_ASSERT_TENSOR_EQ(to_check, truth);
+    }
+}
+
+namespace {
+void test_subtensor_record(int level) {
+    auto test_graph = TestGraph();
+    auto compute_graph = test_graph.m_network->graph;
+    compute_graph->options().force_output_use_user_specified_memory = true;
+    compute_graph->options().comp_node_seq_record_level = 1;
+    if (level == 2) {
+        test_graph.create_graph_with_setsubtensor();
+    } else if (level == 1) {
+        test_graph.create_graph_with_subtensor_forward();
+    } else {
+        test_graph.create_graph_with_subtensor_relayout();
+    }
+    HostTensorND truth;
+    auto func = test_graph.compile_without_copy();
+    auto&& outvar = func->get_output_vars()[0];
+    DeviceTensorND tmp(test_graph.m_cn, {1, 8, 7, 7});
+    outvar->init_mem_plan(&tmp);
+    size_t times = 10;
+    for (size_t i = 0; i < times; i++) {
+        auto input_tensor = test_graph.input_tensor;
+        auto layout = input_tensor->layout();
+        size_t length = layout.total_nr_elems();
+        auto storage = TensorStorage<HostTensorStorageTrait>(test_graph.m_cn);
+        storage.ensure_size(length * sizeof(float));
+        float* ptr = storage.ptr()->as<float>();
+        for (size_t d = 0; d < length; d++) {
+            ptr[d] = i / 5 + 3;
+        }
+        input_tensor->only_reset_raw_storage(storage);
+        DeviceTensorND dv(test_graph.m_cn, {1, 8, 7, 7});
+        dv.raw_ptr();
+
+        auto& dev_tensor = outvar->mutable_dev_tensor();
+        dev_tensor.only_reset_raw_storage(dv.storage());
+
+        func->execute();
+        func->wait();
+        if (i % 5 == 0) {
+            truth.copy_from(dv).sync();
+            continue;
+        }
+        HostTensorND to_check;
+        to_check.copy_from(dv).sync();
+        MGB_ASSERT_TENSOR_EQ(to_check, truth);
+    }
+}
+}  // namespace
+
+TEST(TestNoCopy, IONoCopyRecordSubTensor) {
+    test_subtensor_record(0);
+}
+
+TEST(TestNoCopy, IONoCopyRecordSubTensorRelayout) {
+    test_subtensor_record(1);
+}
 
-TEST(TestNoCopy, OutputNoCopyRecord) {}
+//! TODO: CompNode memory copy does not record the reference ptr yet; once it
+//! does, the test below should pass and can be enabled
+/*TEST(TestNoCopy, IONoCopyRecordSetSubTensor) {
+    test_subtensor_record(2);
+}*/
 
 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
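The record-level tests above all follow one protocol: seed the output var's mem plan once with a placeholder tensor, then, per iteration, swap both the input host storage and the output device storage before execute(). Distilled into a hypothetical helper (names and the single-output assumption are placeholders):

    #include "megbrain/graph.h"

    using namespace mgb;

    // One iteration of the rebind-and-run protocol used by the record tests:
    // `func` comes from graph->compile() with comp_node_seq_record_level = 1
    // and force_output_use_user_specified_memory = true; `outvar` is
    // func->get_output_vars()[0], whose mem plan was initialized once before.
    void run_into_user_buffer(cg::AsyncExecutable* func, VarNode* outvar,
                              CompNode cn, const TensorShape& out_shape) {
        DeviceTensorND dv(cn, out_shape);
        dv.raw_ptr();  // materialize the user-owned buffer
        outvar->mutable_dev_tensor().only_reset_raw_storage(dv.storage());
        func->execute();
        func->wait();
        // dv now holds this iteration's result; no device-to-device copy.
    }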
diff --git a/src/jit/impl/nvrtc/compiler_cuda.cpp b/src/jit/impl/nvrtc/compiler_cuda.cpp
index 6f9d1f58..df5d8399 100644
--- a/src/jit/impl/nvrtc/compiler_cuda.cpp
+++ b/src/jit/impl/nvrtc/compiler_cuda.cpp
@@ -133,7 +133,7 @@ void setup_and_launch(const JITExecutor* fusion_opr, CUfunction func, int block_
         host_init_pvisitor<ndim>(pvisitors[i], args.inputs[i].layout);
     }
     datum[nr_inps] = reinterpret_cast<CUdeviceptr>(
-            args.outputs[0].from->dev_tensor().as_megdnn().raw_ptr);
+            args.outputs[0].from->dev_tensor().as_megdnn().raw_ptr());
     size_t num_elements = args.outputs[0].layout.total_nr_elems();
     mgb_assert(
             num_elements <= UINT32_MAX,
@@ -152,11 +152,11 @@ void setup_and_launch(const JITExecutor* fusion_opr, CUfunction func, int block_
         exec_args[0] = datum.data();
         exec_args[2] = pvisitors.data();
     } else {
-        datum_dev = args.outputs[1].from->dev_tensor().as_megdnn().raw_ptr;
+        datum_dev = args.outputs[1].from->dev_tensor().as_megdnn().raw_ptr();
         MGB_CUDA_CHECK(cudaMemcpyAsync(
                 datum_dev, datum.data(), (nr_inps + 1) * sizeof(CUdeviceptr),
                 cudaMemcpyHostToDevice, env.cuda_env().stream));
-        p_visitors_dev = args.outputs[2].from->dev_tensor().as_megdnn().raw_ptr;
+        p_visitors_dev = args.outputs[2].from->dev_tensor().as_megdnn().raw_ptr();
         MGB_CUDA_CHECK(cudaMemcpyAsync(
                 p_visitors_dev, pvisitors.data(),
                 nr_inps * sizeof(ParamElemVisitor<ndim>), cudaMemcpyHostToDevice,
diff --git a/src/opr/impl/basic_arith.cpp b/src/opr/impl/basic_arith.cpp
index 77ff4772..3d05f401 100644
--- a/src/opr/impl/basic_arith.cpp
+++ b/src/opr/impl/basic_arith.cpp
@@ -1269,7 +1269,9 @@ void Reduce::KernScheduler::update_ptr(
     mgb_assert(
             dest.shape().total_nr_elems() ==
             m_kern_param.back().output.layout.total_nr_elems());
-    m_kern_param[0].input.raw_ptr = const_cast<dt_byte*>(input.raw_ptr());
+    auto in_tensor = input.as_megdnn();
+    in_tensor.layout = m_kern_param[0].input.layout;
+    m_kern_param[0].input = in_tensor;
     dt_byte* workspace_begin =
             workspace_size() ? const_cast<dt_byte*>(workspace.raw_ptr())
                              : nullptr;
@@ -1280,12 +1282,14 @@ void Reduce::KernScheduler::update_ptr(
             *kern_workspace = workspace_begin + m_workspace_spec[2].offset;
     for (size_t i = 0; i < m_kern_param.size() - 1; ++i) {
         auto optr = tmp_reduce_ptr[i % 2];
-        m_kern_param[i].output.raw_ptr = optr;
-        m_kern_param[i + 1].input.raw_ptr = optr;
+        m_kern_param[i].output.reset_ptr(optr);
+        m_kern_param[i + 1].input.reset_ptr(optr);
     }
     for (auto&& i : m_kern_param)
         i.workspace.raw_ptr = kern_workspace;
-    m_kern_param.back().output.raw_ptr = const_cast<dt_byte*>(dest.raw_ptr());
+    auto out_tensor = dest.as_megdnn();
+    out_tensor.layout = m_kern_param.back().output.layout;
+    m_kern_param.back().output = out_tensor;
 }
 
 void Reduce::KernScheduler::execute(
@@ -1343,8 +1347,8 @@ void Reduce::KernScheduler::execute(
     }
     mgb_assert(
             input.layout().is_contiguous() &&
-            input.raw_ptr() == m_kern_param[0].input.raw_ptr &&
-            dest.raw_ptr() == m_kern_param.back().output.raw_ptr);
+            input.raw_ptr() == m_kern_param[0].input.raw_ptr() &&
+            dest.raw_ptr() == m_kern_param.back().output.raw_ptr());
     for (auto&& i : m_kern_param) {
         opr->param() = i.KernParam::kparam;
         opr->exec(i.input, i.output, i.workspace);
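In Reduce::KernScheduler::update_ptr the multi-pass kern params form a chain whose endpoints are now re-seeded from the RefPtr-backed input/output tensors, while the intermediate buffers ping-pong between two temporaries via reset_ptr, leaving each param's layout untouched. The ping-pong pattern in isolation (the Pass struct is a hypothetical stand-in for the kern param pair):

    #include "megdnn/basic_types.h"

    #include <cstddef>
    #include <vector>

    struct Pass {
        megdnn::TensorND input, output;  // stand-in for Reduce kern params
    };

    // Re-point the chain at two scratch buffers: the output of pass i feeds
    // the input of pass i + 1, alternating between tmp0 and tmp1.
    void rebind_chain(std::vector<Pass>& passes, void* tmp0, void* tmp1) {
        void* const tmp[2] = {tmp0, tmp1};
        for (size_t i = 0; i + 1 < passes.size(); ++i) {
            void* optr = tmp[i % 2];
            passes[i].output.reset_ptr(optr);
            passes[i + 1].input.reset_ptr(optr);
        }
    }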
diff --git a/src/opr/impl/cond.cpp b/src/opr/impl/cond.cpp
index 38d43a6c..28a73c14 100644
--- a/src/opr/impl/cond.cpp
+++ b/src/opr/impl/cond.cpp
@@ -1157,7 +1157,7 @@ void CondExecMerge::scn_do_execute() {
             if (forwarded[oidx]) {
                 ovar->shape_alloc(ovar->shape());
                 auto&& own_dest = ovar->dev_tensor().as_megdnn();
-                mgb_assert(own_dest.raw_ptr != dest.raw_ptr);
+                mgb_assert(own_dest.raw_ptr() != dest.raw_ptr());
                 dnn_opr->exec({dest, src}, own_dest);
                 forwarded[oidx] = false;
             } else {
diff --git a/src/opr/impl/misc.cpp b/src/opr/impl/misc.cpp
index e5983f19..584ecb5e 100644
--- a/src/opr/impl/misc.cpp
+++ b/src/opr/impl/misc.cpp
@@ -241,9 +241,9 @@ void NvOf::scn_do_execute() {
     }
 
     nv_flow_extractor->extract_flow(
-            static_cast<uint8_t*>(input(0)->dev_tensor().as_megdnn().raw_ptr),
+            static_cast<uint8_t*>(input(0)->dev_tensor().as_megdnn().raw_ptr()),
             vshape,
-            reinterpret_cast<int16_t*>(output(0)->dev_tensor().as_megdnn().raw_ptr));
+            reinterpret_cast<int16_t*>(output(0)->dev_tensor().as_megdnn().raw_ptr()));
 }
 
 void NvOf::init_output_static_infer_desc() {
diff --git a/src/opr/impl/tensor_manip.cpp b/src/opr/impl/tensor_manip.cpp
index a9a9959a..cfac8d64 100644
--- a/src/opr/impl/tensor_manip.cpp
+++ b/src/opr/impl/tensor_manip.cpp
@@ -1425,7 +1425,7 @@ void ParamPackConcat::scn_do_execute() {
     m_inp_ptr.resize(inputs.size() - 1);
     auto ptr = m_inp_ptr.data();
     for (size_t i = 0; i < inputs.size() - 1; i++) {
-        ptr[i] = inputs[i]->dev_tensor().as_megdnn().raw_ptr;
+        ptr[i] = inputs[i]->dev_tensor().as_megdnn().raw_ptr();
     }
     auto offsets = inputs.back()->dev_tensor().as_megdnn();
     megdnn::TensorND srcs(
diff --git a/src/opr/test/dnn/convolution.cpp b/src/opr/test/dnn/convolution.cpp
index 0da9a24a..00d5a6ff 100644
--- a/src/opr/test/dnn/convolution.cpp
+++ b/src/opr/test/dnn/convolution.cpp
@@ -2572,8 +2572,8 @@ TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) {
             ASSERT_EQ(pf->tensors.size(), 2);
             ASSERT_TRUE(pf->tensors[0].layout.eq_shape({1, 2, 3, 4}));
             ASSERT_TRUE(pf->tensors[1].layout.eq_shape({5, 6, 7, 8}));
-            ASSERT_NE(pf->tensors[0].raw_ptr, nullptr);
-            ASSERT_NE(pf->tensors[1].raw_ptr, nullptr);
+            ASSERT_NE(pf->tensors[0].raw_ptr(), nullptr);
+            ASSERT_NE(pf->tensors[1].raw_ptr(), nullptr);
             pf->tensors[0].ptr<float>()[0] = 114.514f;
             pf->tensors[1].ptr<float>()[0] = 1926.0817f;
         }));
diff --git a/src/plugin/impl/var_sanity_check.cpp b/src/plugin/impl/var_sanity_check.cpp
index 11801af6..f4c3c685 100644
--- a/src/plugin/impl/var_sanity_check.cpp
+++ b/src/plugin/impl/var_sanity_check.cpp
@@ -178,7 +178,7 @@ VarSanityCheck::ChecksumResult VarSanityCheck::calc_checksum(VarNode* var) {
     auto span = dt.layout().span();
 
     megdnn::TensorND tensor;
-    tensor.raw_ptr = dt.raw_ptr() + span.low_byte;
+    tensor.reset_ptr(dt.raw_ptr() + span.low_byte);
     tensor.layout.init_contiguous_stride({span.dist_byte()});
     tensor.layout.dtype = dtype::Byte();