GitOrigin-RevId: 2b7313ebe3
release-1.11
@@ -25,7 +25,11 @@ class OprParamsLoadContext final : public serialization::OprLoadContextRawPOD {
    std::shared_ptr<HostTensorND> load_tensor() override { mgb_assert(0); }
    std::shared_ptr<DeviceTensorND> load_tensor_shared() override { mgb_assert(0); }
    std::shared_ptr<DeviceTensorND> load_tensor_shared(
            bool copy_immediatly = false) override {
        (void)copy_immediatly;
        mgb_assert(0);
    }
    const serialization::GraphLoadConfig& config() const override { mgb_assert(0); }
@@ -245,9 +245,6 @@ int LITE_destroy_network(LiteNetwork network) {
    auto& global_holder = get_gloabl_network_holder();
    if (global_holder.find(network) != global_holder.end()) {
        global_holder.erase(network);
    } else {
        //! means the network has been destroyed
        return -1;
    }
    LITE_CAPI_END();
}
@@ -75,9 +75,6 @@ int LITE_destroy_tensor(LiteTensor tensor) {
    auto& global_holder = get_global_tensor_holder();
    if (global_holder.find(tensor) != global_holder.end()) {
        global_holder.erase(tensor);
    } else {
        //! return -1, means the tensor has been destroyed.
        return -1;
    }
    LITE_CAPI_END();
}
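A minimal usage sketch of the relaxed destroy semantics (illustration only, reusing the C-API calls exercised by the tests below; not part of the patch):

    // Destroying a handle that is no longer registered in the global holder is
    // now a silent no-op returning 0 instead of -1, so a second destroy is safe.
    LiteNetwork net;
    LITE_CAPI_CHECK(LITE_make_network(&net, *default_config(), *default_network_io()));
    LITE_CAPI_CHECK(LITE_destroy_network(net));
    LITE_CAPI_CHECK(LITE_destroy_network(net));  // previously returned -1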
@@ -16,26 +16,8 @@ void ModelLite::create_network() {
}
void ModelLite::load_model() {
    if (share_model_mem) {
        //! WARNING: maybe not right to share param memory for this
        LITE_LOG("enable share model memory");
        FILE* fin = fopen(model_path.c_str(), "rb");
        LITE_ASSERT(fin, "failed to open %s: %s", model_path.c_str(), strerror(errno));
        fseek(fin, 0, SEEK_END);
        size_t size = ftell(fin);
        fseek(fin, 0, SEEK_SET);
        void* ptr = malloc(size);
        std::shared_ptr<void> buf{ptr, free};
        auto nr = fread(buf.get(), 1, size, fin);
        LITE_ASSERT(nr == size, "read model file failed");
        fclose(fin);
        m_network->load_model(buf.get(), size);
    } else {
        m_network->load_model(model_path);
    }
    //! lite shares the model memory by default
    m_network->load_model(model_path);
}
void ModelLite::run_model() {
@@ -128,7 +128,10 @@ std::shared_ptr<void> ModelParser::decrypt_memory(
    const uint8_t* memory_ptr = data;
    if (decryption_name == "NONE") {
        result_length = length;
        return std::shared_ptr<void>(const_cast<uint8_t*>(memory_ptr), [](void*) {});
        std::shared_ptr<uint8_t> shptr{
                new uint8_t[length], [](uint8_t* p) { delete[] p; }};
        memcpy(shptr.get(), data, length);
        return shptr;
    }
    LITE_LOCK_GUARD(decryption_static_data().map_mutex);
    auto it = decryption_static_data().decryption_methods.find(decryption_name);
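A self-contained sketch (plain C++, not MegEngine API) of why the "NONE" path now copies: a non-owning alias stays tied to the caller's buffer, which the shared-memory loading introduced by this patch may later reuse or rewrite, while an owning copy keeps the result valid on its own.

    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <memory>

    // non-owning view: the deleter is a no-op, lifetime is still bound to `data`
    std::shared_ptr<void> alias_no_copy(const uint8_t* data) {
        return std::shared_ptr<void>(const_cast<uint8_t*>(data), [](void*) {});
    }

    // owning copy: survives even if the source buffer is modified afterwards
    std::shared_ptr<void> owned_copy(const uint8_t* data, size_t length) {
        std::shared_ptr<uint8_t> buf{new uint8_t[length], [](uint8_t* p) { delete[] p; }};
        std::memcpy(buf.get(), data, length);
        return buf;
    }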
@@ -1032,7 +1032,7 @@ TEST(TestCapiNetWork, GlobalHolder) {
            LITE_make_network(&c_network, *default_config(), *default_network_io()));
    //! make sure destroy_network is destroyed by LITE_make_network
    LITE_destroy_network(destroy_network);
    ASSERT_EQ(LITE_destroy_network(destroy_network), -1);
    ASSERT_EQ(LITE_destroy_network(destroy_network), 0);
    LITE_CAPI_CHECK(LITE_destroy_network(c_network));
}
@@ -328,7 +328,7 @@ TEST(TestCapiTensor, GlobalHolder) {
    LITE_make_tensor(description, &c_tensor0);
    //! make sure destroy_tensor is destroyed by LITE_make_tensor
    LITE_destroy_tensor(destroy_tensor);
    ASSERT_EQ(LITE_destroy_tensor(destroy_tensor), -1);
    ASSERT_EQ(LITE_destroy_tensor(destroy_tensor), 0);
    LITE_destroy_tensor(c_tensor0);
}
@@ -332,6 +332,7 @@ class TensorND {
public:
    using ChainReturnType = TensorND<TensorStorage>;
    using Storage = TensorStorage;
    MGE_WIN_DECLSPEC_FUC TensorND();
@@ -443,38 +443,41 @@ void run<shape_dep_const_shape>(CompNode cn) {
    HostTensorGenerator<> gen;
    auto host_x = gen({4, 5}, cn);
    auto fname = output_file("test_comp_node_record_shape_dep_const_shape");
    auto test = [&](serialization::GraphDumpFormat format) {
        HostTensorND y_expect;
        {
            // dump graph
            auto graph = ComputingGraph::make();
            auto x = opr::Host2DeviceCopy::make(
                            *graph, host_x, OperatorNodeConfig{"x"}),
                 y = x.flatten() +
                     opr::reduce_sum(opr::GetVarShape::make(x), x.make_scalar(1));
    HostTensorND y_expect;
    {
        // dump graph
        auto graph = ComputingGraph::make();
        auto x = opr::Host2DeviceCopy::make(*graph, host_x, OperatorNodeConfig{"x"}),
             y = x.flatten() +
                 opr::reduce_sum(opr::GetVarShape::make(x), x.make_scalar(1));
        graph->compile({make_callback_copy(y, y_expect)})->execute();
        auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
        dumper->dump({y});
    }
        graph->compile({make_callback_copy(y, y_expect)})->execute();
        HostTensorND host_y;
        {
            GraphLoadConfig config;
            config.const_var_shape = true;
            auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
            auto load_rst = loader->load(config);
            load_rst.graph->options().comp_node_seq_record_level = 2;
            load_rst.graph->options().var_sanity_check_first_run = false;
            auto x_inp = load_rst.tensor_map.at("x");
            auto y = load_rst.output_var_list.at(0);
            auto func = load_rst.graph_compile({make_callback_copy(y, host_y)});
            x_inp->copy_from(*host_x);
            func->execute();
        }
            auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()), format);
            dumper->dump({y});
        }
        MGB_ASSERT_TENSOR_EQ(y_expect, host_y);
    HostTensorND host_y;
    {
        GraphLoadConfig config;
        config.const_var_shape = true;
        auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()), format);
        auto load_rst = loader->load(config);
        load_rst.graph->options().comp_node_seq_record_level = 2;
        load_rst.graph->options().var_sanity_check_first_run = false;
        auto x_inp = load_rst.tensor_map.at("x");
        auto y = load_rst.output_var_list.at(0);
        auto func = load_rst.graph_compile({make_callback_copy(y, host_y)});
        x_inp->copy_from(*host_x);
        func->execute();
    }
    MGB_ASSERT_TENSOR_EQ(y_expect, host_y);
    };
    test({});
    test(serialization::GraphDumpFormat::FLATBUFFERS_V2);
}
//! single thread multi recorder run interleave
@@ -367,16 +367,19 @@ MGB_DYN_TYPE_OBJ_FINAL_IMPL(ImmutableTensor);
class ImmutableTensor::Value {
    MGB_MUTEX m_mtx;
    DeviceTensorND m_dev, m_static_infer;
    std::shared_ptr<DeviceTensorND> m_dev = std::make_shared<DeviceTensorND>();
    DeviceTensorND m_static_infer;
    std::string m_summary;
public:
    void setup(CompNode cn, const HostTensorND& val);
    bool initialized() const { return m_dev.shape_valid(); }
    void setup(std::shared_ptr<DeviceTensorND> val);
    bool initialized() const { return m_dev->shape_valid(); }
    //! value on comp node
    const DeviceTensorND& dev() const { return m_dev; }
    const DeviceTensorND& dev() const { return *m_dev; }
    //! get value on static infer CPU node
    DeviceTensorND& static_infer();
@@ -385,10 +388,17 @@ public:
    const std::string& summary() const { return m_summary; }
};
void ImmutableTensor::Value::setup(std::shared_ptr<DeviceTensorND> val) {
    mgb_assert(val);
    m_dev = val;
    m_summary = ssprintf("const%s", val->shape().to_string().c_str());
}
void ImmutableTensor::Value::setup(CompNode cn, const HostTensorND& val) {
    mgb_assert(m_dev.empty() && !m_dev.shape_valid());
    m_dev.comp_node(cn).copy_from(val).sync();
    mgb_assert(val.empty() == m_dev.empty());
    mgb_assert(m_dev->empty() && !m_dev->shape_valid());
    m_dev->comp_node(cn).copy_from(val).sync();
    mgb_assert(val.empty() == m_dev->empty());
    auto one_elem = [](const TensorShape& shape) {
        for (size_t i = 0; i < shape.ndim; ++i) {
@@ -413,8 +423,8 @@ void ImmutableTensor::Value::setup(CompNode cn, const HostTensorND& val) {
DeviceTensorND& ImmutableTensor::Value::static_infer() {
    MGB_LOCK_GUARD(m_mtx);
    if (!m_static_infer.shape_valid()) {
        mgb_assert(m_dev.shape_valid());
        m_static_infer.comp_node(CompNode::default_cpu()).copy_from(m_dev);
        mgb_assert(m_dev->shape_valid());
        m_static_infer.comp_node(CompNode::default_cpu()).copy_from(*m_dev);
    }
    return m_static_infer;
}
@@ -589,6 +599,19 @@ SymbolVar ImmutableTensor::make(
}
SymbolVar ImmutableTensor::make(
        ComputingGraph& graph, std::shared_ptr<DeviceTensorND> val,
        const OperatorNodeConfig& config) {
    auto cn = val->comp_node();
    if (config.has_comp_node_set())
        cn = config.get_single_comp_node();
    auto value = std::make_shared<Value>();
    value->setup(val);
    return make_from_value(graph, *value, value, config);
}
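A minimal usage sketch of the new overload (illustration only; assumes `hv` is an existing HostTensorND with a valid shape and dtype and that a cpu0 comp node is available):

    auto graph = ComputingGraph::make();
    auto dv = std::make_shared<DeviceTensorND>();
    dv->comp_node(CompNode::load("cpu0")).copy_from(hv).sync();
    // the value is already resident on the device, so no host-to-device copy
    // is deferred to the first execution
    auto var = opr::ImmutableTensor::make(*graph, dv);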
SymbolVar ImmutableTensor::make(
        ComputingGraph& graph, const DTypeScalar& val,
        const OperatorNodeConfig& config) {
    mgb_assert(
@@ -132,8 +132,10 @@ struct OprLoadDumpImpl<opr::ImmutableTensor, 0> {
            OprLoadContext& ctx, const cg::VarNodeArray& inputs,
            const OperatorNodeConfig& config) {
        mgb_assert(inputs.empty());
        auto val = ctx.load_tensor();
        return Opr::make(ctx.graph(), *val, config).node()->owner_opr();
        //! ImmutableTensor may be used in shape or value inference, so its
        //! value must be copied to the device immediately
        auto val = ctx.load_tensor_shared(true);
        return Opr::make(ctx.graph(), val, config).node()->owner_opr();
    }
};
@@ -32,8 +32,10 @@ struct OprLoadDumpImplV2<opr::ImmutableTensor, 0> {
        auto fopr = reinterpret_cast<const fbs::v2::Operator*>(
                fbs_ctx.get_current_opr_data());
        if (fopr->tensors() && fopr->tensors()->size() > 0) {
            auto val = fbs_ctx.load_tensor();
            return Opr::make(fbs_ctx.graph(), *val, config).node()->owner_opr();
            //! ImmutableTensor may be used in shape or value inference, so its
            //! value must be copied to the device immediately
            auto val = fbs_ctx.load_tensor_shared(true);
            return Opr::make(fbs_ctx.graph(), val, config).node()->owner_opr();
        } else {
            mgb_throw(SerializationError, "ImmutableTensor load with no tensor data.");
        }
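A conceptual, standard-C++-only sketch of the eager vs. lazy distinction behind the `copy_immediatly` flag (the names below are hypothetical, not MegBrain API): static shape/value inference runs right after the graph is loaded, before the first execution, so a value that is only filled in lazily would not be visible to it.

    #include <functional>
    #include <utility>
    #include <vector>

    struct LazyValueLoader {
        std::vector<std::function<void()>> pending;  // flushed on first execution
        void load(bool copy_immediately, std::function<void()> do_copy) {
            if (copy_immediately)
                do_copy();  // value usable right away, e.g. for static inference
            else
                pending.push_back(std::move(do_copy));
        }
    };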
@@ -360,6 +360,10 @@ public:
            ComputingGraph& graph, const HostTensorND& val,
            const OperatorNodeConfig& config = {});
    MGE_WIN_DECLSPEC_FUC static SymbolVar make(
            ComputingGraph& graph, std::shared_ptr<DeviceTensorND> val,
            const OperatorNodeConfig& config = {});
    //! make from DTypeScalar; comp node must be provided in config
    MGE_WIN_DECLSPEC_FUC static SymbolVar make(
            ComputingGraph& graph, const DTypeScalar& val,
@@ -138,6 +138,10 @@ public:
        mgb_assert(m_refhold && size);
    }
    bool is_shared_memory() override { return true; }
    bool writable() override { return m_writable; }
    void have_modified() override { m_modified = true; }
    void rewind() override {
        if (m_modified) {
            // data has been modified; cannot be read again
@@ -63,7 +63,11 @@ class OprLoadContextMemory final : public OprLoadContextRawPOD {
    std::shared_ptr<HostTensorND> load_tensor() override { mgb_assert(0); }
    std::shared_ptr<DeviceTensorND> load_tensor_shared() override { mgb_assert(0); }
    std::shared_ptr<DeviceTensorND> load_tensor_shared(
            bool copy_immediatly = false) override {
        (void)copy_immediatly;
        mgb_assert(0);
    }
    const GraphLoadConfig& config() const override {
        mgb_throw(GraphError, "OprLoadContextMemory has no associated config");
@@ -483,7 +483,8 @@ class GraphLoaderOSS::OprLoadContextImpl final : public OprLoadContextFlatBuffer
    std::shared_ptr<HostTensorND> load_tensor() override;
    std::shared_ptr<DeviceTensorND> load_tensor_shared() override;
    std::shared_ptr<DeviceTensorND> load_tensor_shared(
            bool copy_immediatly = false) override;
    void load_single_opr(const fbs::Operator* opr);
@@ -641,8 +642,8 @@ std::shared_ptr<HostTensorND> GraphLoaderOSS::OprLoadContextImpl::load_tensor()
    return ret;
}
std::shared_ptr<DeviceTensorND> GraphLoaderOSS::OprLoadContextImpl::
        load_tensor_shared() {
std::shared_ptr<DeviceTensorND> GraphLoaderOSS::OprLoadContextImpl::load_tensor_shared(
        bool copy_immediatly) {
    mgb_assert(
            m_current_opr->tensors() &&
            m_cur_opr_tensor_cnt < m_current_opr->tensors()->size());
@@ -650,6 +651,9 @@ std::shared_ptr<DeviceTensorND> GraphLoaderOSS::OprLoadContextImpl::
    auto comp_node = load_comp_node(tensor->comp_node());
    auto layout = load_tensor_layout(tensor);
    mgb_assert(tensor->data_size());
    if (m_loader->m_shared_tensor_map.size() <= m_cur_shared_tensor_idx) {
        m_loader->m_shared_tensor_map.resize(m_cur_shared_tensor_idx + 5);
    }
    auto&& sh_reg = m_loader->m_shared_tensor_map.at(m_cur_shared_tensor_idx++);
    auto&& sh_ptr_ref = sh_reg.second[comp_node.mem_node()];
    if (sh_ptr_ref) {
@@ -673,6 +677,11 @@ std::shared_ptr<DeviceTensorND> GraphLoaderOSS::OprLoadContextImpl::
        load_tensor_value(&hv, layout, tensor);
        sh_ptr_ref = std::make_shared<DeviceTensorND>();
        *sh_ptr_ref = DeviceTensorND::make_proxy(hv);
    } else if (copy_immediatly) {
        HostTensorND hv{CompNode::default_cpu()};
        load_tensor_value(&hv, layout, tensor);
        sh_ptr_ref = std::make_shared<DeviceTensorND>();
        sh_ptr_ref->comp_node(comp_node).copy_from(hv).sync();
    } else {
        // use lazy load for non-CPU devices
        HostTensorND hv{CompNode::default_cpu()};
@@ -803,7 +812,7 @@ GraphLoader::LoadResult GraphLoaderOSS::OprLoadContextImpl::load_oprs() {
        ret.output_var_map_id[out->original_id()] = var;
        ret.output_var_list[i] = var;
    }
    mgb_assert(m_cur_shared_tensor_idx == m_loader->m_shared_tensor_map.size());
    mgb_assert(m_cur_shared_tensor_idx <= m_loader->m_shared_tensor_map.size());
    return ret;
}
@@ -880,7 +889,7 @@ GraphLoader::LoadResult GraphLoaderOSS::load(const LoadConfig& config, bool rewi
    if (m_shared_tensor_map.empty()) {
        m_shared_tensor_map.resize(m_graph->nr_shared_tensor());
    } else {
        mgb_assert(m_shared_tensor_map.size() == m_graph->nr_shared_tensor());
        mgb_assert(m_shared_tensor_map.size() >= m_graph->nr_shared_tensor());
    }
    OprLoadContextImpl ctx{this, m_graph->mgb_version()};
@@ -1,5 +1,6 @@
#if MGB_ENABLE_FBS_SERIALIZATION
#include <map>
#include "megbrain/comp_node_env.h"
#include "megbrain/opr/io.h"
#include "megbrain/serialization/helper.h"
@@ -523,6 +524,77 @@ void GraphDumperOSSV2::dump_buf_with_len(const void* data, uint32_t size) {
}
// ----------------------------- Loader --------------------------------------
/**
 * SharedTensorAlignMent records all shared device tensors. When they are first
 * loaded the tensors are not aligned; after all shared device tensors have
 * been loaded, the user-provided memory is rewritten and every tensor is moved
 * to an aligned address.
 */
class GraphLoaderOSSV2::SharedTensorAlignMent {
public:
    SharedTensorAlignMent(SharedBuffer buffer, InputFile* file, bool is_enabled)
            : m_enabled(is_enabled), m_file(file), m_model_buffer(buffer){};
    bool add_device_tensor(std::shared_ptr<DeviceTensorND> tensor) {
        if (!m_enabled)
            return false;
        if (tensor) {
            m_device_tensors[reinterpret_cast<intptr_t>(tensor->raw_ptr())] = tensor;
            return true;
        }
        return false;
    }
    /**
     * Copy every tensor recorded from m_model_buffer to its aligned address.
     * This modifies the model buffer in place, so the file cannot be read
     * again afterwards.
     */
    bool reorder_and_align_tensor() {
        if (!m_enabled)
            return false;
        bool modilfied = false;
        intptr_t buffer_start = reinterpret_cast<intptr_t>(m_model_buffer.data());
        intptr_t write_end = buffer_start;
        for (auto& iter : m_device_tensors) {
            auto& tensor = iter.second;
            size_t tensor_size = tensor->layout().span().dist_byte();
            size_t alignment = tensor->comp_node().get_mem_addr_alignment();
            intptr_t tensor_start = reinterpret_cast<intptr_t>(tensor->raw_ptr());
            intptr_t align_start = static_cast<intptr_t>(
                    reinterpret_cast<uintptr_t>(tensor->raw_ptr()) & ~(alignment - 1));
            if (align_start > write_end) {
                if (tensor_start != align_start) {
                    memmove(reinterpret_cast<void*>(align_start),
                            reinterpret_cast<void*>(tensor_start), tensor_size);
                    modilfied = true;
                }
                write_end = align_start + tensor_size;
                DeviceTensorStorage storage;
                auto raw_storage = std::shared_ptr<mgb::dt_byte>(
                        reinterpret_cast<mgb::dt_byte*>(align_start), [](void*) {});
                storage.reset(tensor->comp_node(), tensor_size, raw_storage);
                tensor->reset(storage, tensor->layout());
            } else {
                DeviceTensorND new_tensor(tensor->comp_node());
                new_tensor.copy_from(*tensor).sync();
                *tensor = new_tensor;
            }
            if (modilfied) {
                m_file->have_modified();
            }
        }
        return true;
    }
private:
    bool m_enabled = false;
    InputFile* m_file;
    SharedBuffer m_model_buffer;
    std::map<intptr_t, std::shared_ptr<DeviceTensorND>> m_device_tensors;
};
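A self-contained illustration of the address rounding used in reorder_and_align_tensor above (assumes the alignment returned by get_mem_addr_alignment() is a power of two):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    uintptr_t align_down(uintptr_t addr, size_t alignment) {
        assert((alignment & (alignment - 1)) == 0);  // power of two
        return addr & ~(static_cast<uintptr_t>(alignment) - 1);
    }
    // e.g. align_down(0x1003, 0x10) == 0x1000: a tensor starting at 0x1003 is
    // moved back by 3 bytes, provided the aligned start is still past write_end.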
CompNode GraphLoaderOSSV2::OprLoadContextImpl::load_comp_node(
        const fbs::v2::CompNode* comp_node) {
    mgb_assert(comp_node);
@@ -596,7 +668,9 @@ std::shared_ptr<HostTensorND> GraphLoaderOSSV2::OprLoadContextImpl::load_tensor(
                "serialization v2 format is a pure flatbuffer format and does not "
                "support user tensor value loader callbacks."
    }
        memcpy(ret->raw_ptr(), tensor->data()->data(), tensor->data()->size());
        fill_tensor_memory(
                *ret, tensor->data()->data(), tensor->data()->size(),
                m_loader->m_file->is_shared_memory());
    }
    if (tensor->name()) {
        m_tensor_map[tensor->name()->str()] = ret;
@@ -612,7 +686,7 @@ std::shared_ptr<HostTensorND> GraphLoaderOSSV2::OprLoadContextImpl::load_tensor(
}
std::shared_ptr<DeviceTensorND> GraphLoaderOSSV2::OprLoadContextImpl::
        load_tensor_shared() {
        load_tensor_shared(bool copy_immediatly) {
    mgb_assert(
            m_current_opr->tensors() &&
            m_cur_opr_tensor_cnt < m_current_opr->tensors()->size());
@@ -620,6 +694,9 @@ std::shared_ptr<DeviceTensorND> GraphLoaderOSSV2::OprLoadContextImpl::
    auto comp_node = load_comp_node(tensor->comp_node());
    auto layout = load_tensor_layout(tensor, comp_node);
    mgb_assert(tensor->data());
    if (m_loader->m_shared_tensor_map.size() <= m_cur_shared_tensor_idx) {
        m_loader->m_shared_tensor_map.resize(m_cur_shared_tensor_idx + 5);
    }
    auto&& shared_pair = m_loader->m_shared_tensor_map.at(m_cur_shared_tensor_idx++);
    auto&& shared_tensor_ref = shared_pair.second[comp_node.mem_node()];
    if (shared_tensor_ref) {
@@ -637,19 +714,34 @@ std::shared_ptr<DeviceTensorND> GraphLoaderOSSV2::OprLoadContextImpl::
    if (comp_node.mem_node() == CompNode::default_cpu().mem_node()) {
        // directly forward CPU memory
        shared_tensor_ref = std::make_shared<DeviceTensorND>();
        HostTensorND hv{comp_node};
        if (tensor->data() && tensor->data()->size() > 0) {
            hv.dtype(layout.dtype).resize(layout);
            memcpy(hv.raw_ptr(), tensor->data()->data(), tensor->data()->size());
            fill_tensor_memory(
                    hv, tensor->data()->data(), tensor->data()->size(),
                    m_loader->m_file->is_shared_memory());
        }
        shared_tensor_ref = std::make_shared<DeviceTensorND>();
        *shared_tensor_ref = DeviceTensorND::make_proxy(hv);
        m_tensor_alignment->add_device_tensor(shared_tensor_ref);
    } else if (copy_immediatly) {
        HostTensorND hv{CompNode::default_cpu()};
        shared_tensor_ref = std::make_shared<DeviceTensorND>();
        if (tensor->data() && tensor->data()->size() > 0) {
            hv.dtype(layout.dtype).resize(layout);
            fill_tensor_memory(
                    hv, tensor->data()->data(), tensor->data()->size(),
                    m_loader->m_file->is_shared_memory());
        }
        shared_tensor_ref->comp_node(comp_node).copy_from(hv).sync();
    } else {
        // use lazy load for non-CPU devices
        HostTensorND hv{CompNode::default_cpu()};
        if (tensor->data() && tensor->data()->size() > 0) {
            hv.dtype(layout.dtype).resize(layout);
            memcpy(hv.raw_ptr(), tensor->data()->data(), tensor->data()->size());
            fill_tensor_memory(
                    hv, tensor->data()->data(), tensor->data()->size(),
                    m_loader->m_file->is_shared_memory());
        }
        shared_tensor_ref = m_device_value_loader.make(comp_node, std::move(hv));
    }
@@ -784,7 +876,7 @@ GraphLoader::LoadResult GraphLoaderOSSV2::OprLoadContextImpl::load_oprs() {
        ret.output_var_map_id[out->original_id()] = var;
        ret.output_var_list[i] = var;
    }
    mgb_assert(m_cur_shared_tensor_idx == m_loader->m_shared_tensor_map.size());
    mgb_assert(m_cur_shared_tensor_idx <= m_loader->m_shared_tensor_map.size());
    return ret;
}
@@ -808,7 +900,6 @@ GraphLoader::LoadResult GraphLoaderOSSV2::load(const LoadConfig& config, bool re
    m_file->read(&size, sizeof(size));
    m_file->skip(-sizeof(size));
    m_model_buf = m_file->read_shared(size + sizeof(size));
    {
        flatbuffers::Verifier verifier(
                static_cast<const uint8_t*>(m_model_buf.data()), m_model_buf.size());
@@ -838,8 +929,10 @@ GraphLoader::LoadResult GraphLoaderOSSV2::load(const LoadConfig& config, bool re
    } else {
        mgb_assert(m_shared_tensor_map.size() == m_model->nr_shared_tensor());
    }
    OprLoadContextImpl ctx{this, m_model->mge_version()};
    SharedTensorAlignMent tensor_alignment(
            m_model_buf, m_file.get(),
            m_file->writable() && m_file->is_shared_memory());
    OprLoadContextImpl ctx{this, &tensor_alignment, m_model->mge_version()};
    ctx.load_middle_tensor();
    auto metadata = ctx.load_metadata();
    auto result = ctx.load_oprs();
@@ -856,6 +949,7 @@ GraphLoader::LoadResult GraphLoaderOSSV2::load(const LoadConfig& config, bool re
        }
    }
    m_model_loaded = true;
    tensor_alignment.reorder_and_align_tensor();
    result.graph_compile_ahead();
    return result;
}
@@ -41,6 +41,15 @@ public:
    //! return current read offset
    virtual size_t tell() = 0;
    //! whether this file supports sharing its memory when loading a model
    virtual bool is_shared_memory() { return false; }
    //! whether this file can be written
    virtual bool writable() { return false; }
    //! mark that this file has been modified
    virtual void have_modified() {}
    /*!
     * \brief read into a host tensor
     *
@@ -208,7 +208,8 @@ public:
     *
     * It must be dumped with TensorWriteMethod::VALUE_SHARED
     */
    virtual std::shared_ptr<DeviceTensorND> load_tensor_shared() = 0;
    virtual std::shared_ptr<DeviceTensorND> load_tensor_shared(
            bool copy_immediatly = false) = 0;
    //! get associated global configuration
    virtual const GraphLoadConfig& config() const = 0;
@@ -104,6 +104,7 @@ class GraphLoaderOSSV2 final : public GraphLoader {
public:
    class OprLoadContextImpl;
    class SharedTensorAlignMent;
    friend class OprLoadContextImpl;
    GraphLoaderOSSV2(std::unique_ptr<InputFile> input_file)
@@ -136,22 +137,51 @@ class GraphLoaderOSSV2::OprLoadContextImpl final : public OprLoadContextFlatBuff
    size_t m_cur_opr_tensor_cnt;
    size_t m_cur_opr_blob_cnt;
    size_t m_cur_opr_param_cnt;
    SharedTensorAlignMent* m_tensor_alignment;
public:
    friend class SharedTensorAlignMent;
    ComputingGraph& graph() override { return *m_graph; }
    const GraphLoadConfig& config() const override {
        return *m_loader->m_cur_load_config;
    }
    //! share or copy the loaded flatbuffer memory into the CPU tensor; sharing
    //! reduces the memory used when loading a model, but the alignment of the
    //! shared memory must be taken into account
    void fill_tensor_memory(
            HostTensorND& tensor, const uint8_t* data, size_t size, bool shared) {
        auto tensor_size = tensor.layout().span().high_byte;
        mgb_assert(
                size == tensor_size,
                "the size does not match when sharing the flatbuffer memory\n");
        auto ptr = reinterpret_cast<void*>(const_cast<uint8_t*>(data));
        if (shared) {
            HostTensorStorage storage;
            auto raw_storage = std::shared_ptr<mgb::dt_byte>(
                    static_cast<mgb::dt_byte*>(ptr), [](void*) {});
            storage.reset(tensor.comp_node(), size, raw_storage);
            tensor.reset(storage, tensor.layout());
        } else {
            memcpy(tensor.raw_ptr(), data, size);
        }
    }
    std::shared_ptr<HostTensorND> load_tensor() override;
    std::shared_ptr<DeviceTensorND> load_tensor_shared() override;
    std::shared_ptr<DeviceTensorND> load_tensor_shared(
            bool copy_immediatly = false) override;
    void load_single_opr(const fbs::v2::Operator* opr);
    OprLoadContextImpl(GraphLoaderOSSV2* loader, uint32_t version)
            : OprLoadContextFlatBuffers(version), m_loader{loader} {
    OprLoadContextImpl(
            GraphLoaderOSSV2* loader, SharedTensorAlignMent* tensor_alignment,
            uint32_t version)
            : OprLoadContextFlatBuffers(version),
              m_loader{loader},
              m_tensor_alignment(tensor_alignment) {
        m_graph = loader->m_cur_load_config->comp_graph;
        if (!m_graph) {
            m_graph = ComputingGraph::make();
@@ -315,7 +315,7 @@ void test_serializer_custom_loader(GraphDumpFormat format) {
    load();
    load();
    ASSERT_EQ(2u, saved_val.size());
    ASSERT_EQ(1, load_nr_null_ptr); // immutable tensor is not shared
    ASSERT_EQ(2, load_nr_null_ptr); // immutable tensor is also shared
    ASSERT_EQ(4, load_nr_call);
}
@@ -482,10 +482,10 @@ void test_serializer_multiple_param(GraphDumpFormat format) {
    ASSERT_THROW(loader->shared_tensor_id_map(), MegBrainError);
    loader->load();
    auto&& got = loader->shared_tensor_id_map();
    ASSERT_EQ(values.size(), got.size());
    ASSERT_EQ(2 * values.size(), got.size());
    for (size_t i = 0; i < values.size(); ++i) {
        ASSERT_EQ(1u, got[i].second.size());
        auto &&vi = *values[i], &&gi = *got[i].second.begin()->second;
        auto &&vi = *values[i], &&gi = *got[2 * i].second.begin()->second;
        ASSERT_EQ(vi.shape(), gi.shape());
        ASSERT_EQ(vi.comp_node(), gi.comp_node());
        ASSERT_EQ(vi.dtype(), gi.dtype());
@@ -565,7 +565,7 @@ void test_serializer_const_var_shape(GraphDumpFormat format) {
        }
    };
    run_and_check(config);
    ASSERT_EQ(2, nr_tensor);
    ASSERT_EQ(1, nr_tensor); // immutable tensor is now a shared tensor
    ASSERT_EQ(1, nr_mod);
    }
}
@@ -823,6 +823,77 @@ void test_serializer_log_exp(GraphDumpFormat format) {
    load();
}
void test_serializer_memshare(GraphDumpFormat format) {
    std::vector<uint8_t> buf;
    HostTensorGenerator<> gen;
    constexpr size_t SIZE = 127;
    auto xval = gen({SIZE}, "cpu0"), bval = gen({1}, "cpu0");
    auto dump = [&]() {
        auto graph = ComputingGraph::make();
        auto x0 = opr::SharedDeviceTensor::make(*graph, *xval).rename("x0");
        auto x1 = opr::SharedDeviceTensor::make(*graph, *xval).rename("x1");
        auto x2 = opr::SharedDeviceTensor::make(*graph, *xval).rename("x2");
        auto x3 = opr::SharedDeviceTensor::make(*graph, *xval).rename("x3");
        auto i4 = opr::ImmutableTensor::make(*graph, *xval).rename("i4");
        auto i5 = opr::ImmutableTensor::make(*graph, *xval).rename("i5");
        auto b = opr::SharedDeviceTensor::make(*graph, *bval).rename("b");
        auto dumper = GraphDumper::make(OutputFile::make_vector_proxy(&buf), format);
        dumper->dump({((x0 + x1) + b) + (x2 + x3) + i4 + i5, x0, i4});
    };
    HostTensorND expected;
    expected.copy_from(*xval);
    for (size_t i = 0; i < SIZE; ++i) {
        auto&& v = expected.ptr<float>()[i];
        v = v * 6 + bval->ptr<float>()[0];
    }
    std::vector<uint8_t> buf_al;
    auto load = [&](bool share) {
        std::unique_ptr<InputFile> fin;
        if (share) {
            buf_al.resize(buf.size());
            memcpy(buf_al.data(), buf.data(), buf.size());
            fin = InputFile::make_mem_proxy(
                    std::shared_ptr<void>{std::shared_ptr<void>{}, buf_al.data()},
                    buf.size());
        } else {
            fin = InputFile::make_mem_proxy(buf.data(), buf.size());
        }
        auto loader = GraphLoader::make(std::move(fin), format);
        auto rst = loader->load();
        auto x = rst.output_var_map.at("x0");
        auto i4 = rst.output_var_map.at("i4");
        auto&& opr = x.node()->owner_opr()->cast_final_safe<opr::SharedDeviceTensor>();
        auto&& opr_imm =
                i4.node()->owner_opr()->cast_final_safe<opr::ImmutableTensor>();
        HostTensorND val;
        auto func =
                rst.graph_compile({make_callback_copy(rst.output_var_list[0], val)});
        func->execute();
        return std::make_pair(
                val, std::vector<DeviceTensorND>{*opr.dev_data(), opr_imm.value()});
    };
    auto in_range = [](const std::vector<uint8_t>& buf, DeviceTensorND& dv) {
        auto p0 = reinterpret_cast<uint8_t*>(dv.raw_ptr()),
             p1 = reinterpret_cast<uint8_t*>(p0 + dv.layout().span().high_byte);
        return buf.data() <= p0 && p1 <= buf.data() + buf.size();
    };
    for (bool share : {false, true}) {
        buf.clear();
        dump();
        auto get = load(share);
        MGB_ASSERT_TENSOR_EQ(*xval, HostTensorND{}.copy_from(get.second[0]).sync());
        MGB_ASSERT_TENSOR_EQ(expected, get.first);
        ASSERT_EQ(share, in_range(buf_al, get.second[0]));
        ASSERT_EQ(share, in_range(buf_al, get.second[1]));
    }
}
} // namespace
TEST(TestSerializer2, GraphDumpLoad) {
@@ -967,6 +1038,10 @@ TEST(TestSerializer2, LOGEXPV2) {
    test_serializer_log_exp(GraphDumpFormat::FLATBUFFERS_V2);
}
TEST(TestSerializer2, ShareMemv2) {
    test_serializer_memshare(GraphDumpFormat::FLATBUFFERS_V2);
}
TEST(TestSerializer2, TestSoftMaxLoadDump) {
    auto fname = GET_OUTPUT_FILE(GraphDumpFormat::FLATBUFFERS_V2);
    TensorShape shape{2, 3};