- /**
- * \file src/cambricon/test/magicmind_runtime_opr.cpp
- * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
- *
- * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- */
-
- #include "megbrain/comp_node_env.h"
- #include "megbrain/opr/basic_arith.h"
- #include "megbrain/opr/io.h"
- #include "megbrain/plugin/profiler.h"
- #include "megbrain/serialization/serializer.h"
- #include "megbrain/test/helper.h"
-
- #if MGB_CAMBRICON
-
- #include "megbrain/cambricon/magicmind_runtime_opr.h"
-
- #include "interface_builder.h"
- #include "interface_network.h"
-
- using namespace mgb;
- using namespace opr;
- using namespace magicmind;
-
- namespace {
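- // fill \p data with \p num_elems pseudo-random values in roughly
- // [-scale / 2, scale / 2); the generator is reseeded from the wall clock on
- // every call, so two calls within the same second yield identical data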
- template <typename T>
- void gen_rand_data(std::vector<T>& data, size_t num_elems, size_t scale) {
-     unsigned int seed = static_cast<unsigned int>(time(nullptr));
- data.resize(num_elems);
- for (size_t i = 0; i < num_elems; ++i) {
- data[i] =
- static_cast<T>((rand_r(&seed) % (scale * 1000)) / 1000.0 - scale / 2.0);
- }
- }
-
- template <typename T>
- void get_min_max(std::vector<T>& data, double& min, double& max) {
- min = *std::min_element(data.begin(), data.end());
- max = *std::max_element(data.begin(), data.end());
- }
-
- void cast_data_type(
- std::vector<float>& input, void* output, size_t size, cnrtDataType_t input_type,
- cnrtDataType_t output_type, double& min, double& max) {
-     cnrtQuantizedParam_t param = nullptr;
-     if (output_type == CNRT_INT8 || output_type == CNRT_INT16) {
-         get_min_max(input, min, max);
-         const int bitwidth = output_type == CNRT_INT8 ? 8 : 16;
-         auto par_tmp = magicmind::RangeToUniformQuantParamWithQuantAlg(
-                 {min, max}, bitwidth, "symmetric");
-         auto par = magicmind::UniformToNormalCast(par_tmp);
-         MGB_CNRT_CHECK(cnrtCreateQuantizedParam(&param, par.pos, par.scale, 0));
- }
- MGB_CNRT_CHECK(cnrtCastDataType(
- reinterpret_cast<void*>(input.data()), input_type, output, output_type,
- size, param));
- }
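-
- // Usage sketch for cast_data_type (illustrative only, not executed by the
- // tests): quantize a float buffer to int8 and recover the dynamic range that
- // was used to build the quantization parameter.
- //     std::vector<float> src;
- //     gen_rand_data(src, 64, 1);
- //     std::vector<int8_t> dst(src.size());
- //     double min = 0., max = 0.;
- //     cast_data_type(src, dst.data(), src.size(), CNRT_FLOAT32, CNRT_INT8, min, max);
-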
- cnrtDataType_t convert_data_type(magicmind::DataType dtype) {
- static const std::unordered_map<magicmind::DataType, cnrtDataType_t> dtype_map = {
- #define cb(dt_mm_, dt_cnrt_) {magicmind::DataType::dt_mm_, CNRT_##dt_cnrt_}
- cb(QINT8, INT8), cb(QINT16, INT16), cb(INT8, INT8),
- cb(INT16, INT16), cb(INT32, INT32), cb(UINT8, UINT8),
- cb(FLOAT16, FLOAT16), cb(FLOAT32, FLOAT32),
- };
-     auto it = dtype_map.find(dtype);
-     mgb_assert(
-             it != dtype_map.end(), "unsupported magicmind dtype(%u).",
-             static_cast<uint32_t>(dtype));
-     return it->second;
- }
-
- ///! taken from src/jit/impl/utils.cpp
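- ///! e.g. replace_all_pairs_inplace(s, {{"{{KEY}}", "42"}}) rewrites every
- ///! occurrence of "{{KEY}}" in s to "42"; used below to patch the builder's
- ///! JSON config template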
- void replace_all_pairs_inplace(
- std::string& text,
- const std::vector<std::pair<std::string, std::string>>& replace) {
- using str = std::string;
- auto repl_one = [&text](const str& from, const str& to) {
- mgb_assert(!from.empty());
- size_t pos = 0;
- while ((pos = text.find(from, pos)) != str::npos) {
- text.replace(pos, from.size(), to);
- pos += to.size();
- }
- };
- for (auto&& i : replace) {
- repl_one(i.first, i.second);
- }
- }
-
- class MMNetwork {
- public:
- template <typename T>
- using MagicMindUniquePtr = magicmind_intl::MagicMindUniquePtr<T>;
- using IModelPtr = MagicMindRuntimeOpr::IModelPtr;
- using IContextPtr = MagicMindRuntimeOpr::IContextPtr;
- using IEnginePtr = MagicMindRuntimeOpr::IEnginePtr;
-
- const CompNode& cn_;
- magicmind::DataType op_datatype_;
- IModelPtr model_;
- bool graph_shape_mutable_;
- bool built_;
-
- template <typename T>
- static MagicMindUniquePtr<T> make_mm_unique_ptr(T* ptr) {
- return {ptr, magicmind_intl::MagicMindDeleter<T>()};
- }
-
- MMNetwork(
- const CompNode& cn, magicmind::DataType op_datatype,
- bool graph_shape_mutable = false)
- : cn_{cn},
- op_datatype_{op_datatype},
- model_{nullptr},
- graph_shape_mutable_{graph_shape_mutable},
- built_{false} {}
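-     //! build a conv(3x3, stride 1, pad 1) + relu + elemwise-add network with
-     //! two marked outputs ("out1" after relu, "out2" after add); the conv
-     //! input/filter datatype is op_datatype_, the outputs are float32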
- void build() {
- auto&& cnrt_env = CompNodeEnv::from_comp_node(cn_).cnrt_env();
- cnrt_env.activate();
- constexpr int ni = 16, ci = 64, hi = 32, wi = 32;
- constexpr int no = 16, co = 64, ho = 32, wo = 32;
- constexpr int kh = 3, kw = 3;
- constexpr int stride_h = 1, stride_w = 1;
- constexpr int pad_h = 1, pad_w = 1;
- magicmind::Dims input_dim{{ni, ci, hi, wi}};
- magicmind::Dims filter_dim{{co, ci, kh, kw}};
- magicmind::Dims bias_dim{{co}};
- magicmind::Dims add_dim{{no, co, ho, wo}};
- magicmind::DataType output_datatype = magicmind::DataType::FLOAT32;
-
- // init
- auto builder = make_mm_unique_ptr(magicmind::CreateIBuilder());
- auto config = make_mm_unique_ptr(magicmind::CreateIBuilderConfig());
- std::string user_json_config = R"(
- {
- "graph_shape_mutable": {{GRAPH_SHAPE_MUTABLE}},
- "precision_config": {
- "precision_mode": "qint8_mixed_float16"
- }
- }
- )";
- replace_all_pairs_inplace(
- user_json_config,
-                 {{"{{GRAPH_SHAPE_MUTABLE}}", graph_shape_mutable_ ? "true" : "false"}});
- config->ParseFromString(user_json_config);
- auto network = make_mm_unique_ptr(magicmind::CreateINetwork());
- magicmind::Range filter_range = {0.0f, 0.0f};
- // create input tensor
- auto init_tensor = [](magicmind::ITensor* tensor, const std::string& name,
- const Dims& input_dim) {
- magicmind::Range input_range = {0.0f, 0.0f};
- std::vector<float> temp_buffer;
- gen_rand_data(temp_buffer, input_dim.GetElementCount(), 256);
- get_min_max(temp_buffer, input_range.min, input_range.max);
- MM_CHECK(tensor->SetDynamicRange(input_range, false));
- tensor->SetTensorName(name);
- };
- auto input_tensor = network->AddInput(op_datatype_, input_dim);
- init_tensor(input_tensor, "x", input_dim);
- auto add_tensor = network->AddInput(output_datatype, add_dim);
- init_tensor(add_tensor, "add", add_dim);
- // create filter tensor
- magicmind::ITensor* filter_tensor = nullptr;
- {
- std::vector<float> filter_buf;
- gen_rand_data(filter_buf, filter_dim.GetElementCount(), 1);
- std::vector<uint8_t> filter_buf_intx;
- filter_buf_intx.resize(
- filter_dim.GetElementCount() *
- magicmind::DataTypeSize(op_datatype_));
- cast_data_type(
- filter_buf, reinterpret_cast<void*>(filter_buf_intx.data()),
- filter_dim.GetElementCount(), CNRT_FLOAT32,
- convert_data_type(op_datatype_), filter_range.min,
- filter_range.max);
- auto filter = network->AddIConstNode(
- op_datatype_, filter_dim,
- reinterpret_cast<void*>(filter_buf_intx.data()));
- filter_tensor = filter->GetOutput(0);
-             MM_CHECK(filter_tensor->SetDynamicRange(filter_range, false));
- }
-
- // create bias tensor
- magicmind::ITensor* bias_tensor = nullptr;
- {
- std::vector<float> bias_buf;
- gen_rand_data(bias_buf, bias_dim.GetElementCount(), 1);
- std::vector<uint8_t> bias_buf_floatx;
- if (output_datatype == magicmind::DataType::FLOAT16) {
- bias_buf_floatx.resize(
- bias_dim.GetElementCount() *
- magicmind::DataTypeSize(output_datatype));
- double min = 0., max = 0.;
- cast_data_type(
- bias_buf, reinterpret_cast<void*>(bias_buf_floatx.data()),
- bias_dim.GetElementCount(), CNRT_FLOAT32,
- convert_data_type(output_datatype), min, max);
- auto bias = network->AddIConstNode(
- output_datatype, bias_dim,
- reinterpret_cast<void*>(bias_buf_floatx.data()));
- bias_tensor = bias->GetOutput(0);
- } else {
- auto bias = network->AddIConstNode(
- output_datatype, bias_dim,
- reinterpret_cast<void*>(bias_buf.data()));
- bias_tensor = bias->GetOutput(0);
- }
- }
-
- // x w bias
- // \ / |
- // | /
- // conv
- // |
- // relu ------ out1
- // \ add
- // \ /
- // |
- // out2
-
- // create conv + relu node
- auto conv = network->AddIConvNode(input_tensor, filter_tensor, bias_tensor);
- MM_CHECK(conv->SetStride(stride_h, stride_w));
- MM_CHECK(conv->SetPad(pad_h, pad_w, pad_h, pad_w));
- MM_CHECK(conv->SetDilation(1, 1));
- MM_CHECK(conv->SetPaddingMode(magicmind::IPaddingMode::EXPLICIT));
- auto conv_output = conv->GetOutput(0);
-         // the conv output tensor's datatype must match the bias tensor's
-         MM_CHECK(conv->SetOutputType(0, output_datatype));
-         // the relu output tensor's datatype follows that of its input tensor
- auto relu =
- network->AddIActivationNode(conv_output, magicmind::IActivation::RELU);
- MM_CHECK(relu->SetOutputType(0, output_datatype));
- relu->GetOutput(0)->SetTensorName("out1");
-
- // set outputs nodes
- MM_CHECK(network->MarkOutput(relu->GetOutput(0)));
-
- // create elemwise add
- auto add = network->AddIElementwiseNode(
- relu->GetOutput(0), add_tensor, magicmind::IElementwise::ADD);
- add->GetOutput(0)->SetTensorName("out2");
- MM_CHECK(network->MarkOutput(add->GetOutput(0)));
-
- // create model
- model_ = {
- builder->BuildModel("model", network.get(), config.get()),
- magicmind_intl::MagicMindDeleter<magicmind::IModel>()};
- mgb_assert(model_ != nullptr);
-
- built_ = true;
- }
-
- const IModelPtr& get_inference_model() {
- if (!built_)
- build();
- return model_;
- }
-
- std::string get_serialized_model(bool serialize_to_file) {
- if (!built_)
- build();
- size_t size = 0;
- MM_CHECK(model_->GetSerializedModelSize(&size));
- std::string buf;
- buf.resize(size);
- MM_CHECK(model_->SerializeToMemory(reinterpret_cast<void*>(buf.data()), size));
- if (serialize_to_file) {
- std::string fname = ssprintf(
- "./output/MagicMindRuntimeOprTest.%s.mlu",
- graph_shape_mutable_ ? "GraphShapeMutable"
- : "GraphShapeImmutableBatch");
- model_->SerializeToFile(fname.c_str());
- }
- return buf;
- }
-
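-     //! run the built model through the native MagicMind runtime: create an
-     //! engine and context, bind input/output IRTTensors by name, enqueue the
-     //! model several times and report the average device-side latency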
- void infer_model(
- const std::vector<void*>& inputs, const std::vector<void*>& outputs,
- const std::vector<Dims>& input_dims) {
- if (!built_)
- build();
- auto&& cnrt_env = CompNodeEnv::from_comp_node(cn_).cnrt_env();
- cnrt_env.activate();
- auto engine = make_mm_unique_ptr(model_->CreateIEngine());
- mgb_assert(engine != nullptr);
- auto context = make_mm_unique_ptr(engine->CreateIContext());
- mgb_assert(context != nullptr);
-
- // create and get irttensor from context
- std::vector<magicmind::IRTTensor*> input_tensors;
- std::vector<magicmind::IRTTensor*> output_tensors;
- MM_CHECK(CreateInputTensors(context.get(), &input_tensors));
- MM_CHECK(CreateOutputTensors(context.get(), &output_tensors));
- MM_CHECK(FindIRTTensorByName(input_tensors, "x")->SetDimensions(input_dims[0]));
- MM_CHECK(FindIRTTensorByName(input_tensors, "add")
- ->SetDimensions(input_dims[1]));
- MM_CHECK(context->InferOutputShape(input_tensors, output_tensors));
- MM_CHECK(FindIRTTensorByName(input_tensors, "x")->SetData(inputs[0]));
- MM_CHECK(FindIRTTensorByName(input_tensors, "add")->SetData(inputs[1]));
- MM_CHECK(FindIRTTensorByName(output_tensors, "out1")->SetData(outputs[0]));
- MM_CHECK(FindIRTTensorByName(output_tensors, "out2")->SetData(outputs[1]));
-
- auto&& queue = cnrt_env.queue;
- cnrtNotifier_t start, end;
- MGB_CNRT_CHECK(cnrtCreateNotifier(&start));
- MGB_CNRT_CHECK(cnrtCreateNotifier(&end));
- MGB_CNRT_CHECK(cnrtPlaceNotifier(start, queue));
-
- constexpr size_t runs = 50;
- for (size_t i = 0; i < runs; ++i) {
- MM_CHECK(context->Enqueue(input_tensors, output_tensors, queue));
- }
-
- MGB_CNRT_CHECK(cnrtPlaceNotifier(end, queue));
- MGB_CNRT_CHECK(cnrtSyncQueue(queue));
- float time = 0.f;
- MGB_CNRT_CHECK(cnrtNotifierDuration(start, end, &time));
-         printf("average inference time = %.2fms\n",
-                time / static_cast<float>(runs) * 1e-3);
- MGB_CNRT_CHECK(cnrtDestroyNotifier(&start));
- MGB_CNRT_CHECK(cnrtDestroyNotifier(&end));
- }
- };
- } // namespace
-
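- // build a float32 model, run it through the native MagicMind runtime, then
- // feed the same serialized model to MagicMindRuntimeOpr inside a computing
- // graph and check that both paths produce nearly identical outputs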
- TEST(TestMagicMindRuntimeOpr, Basic) {
- REQUIRE_CAMBRICON_DEVICE(1);
- auto cn = CompNode::load("cambricon0");
- MMNetwork network(cn, magicmind::DataType::FLOAT32, false);
- size_t dtype_size = magicmind::DataTypeSize(magicmind::DataType::FLOAT32);
-
-     // shapes of the network inputs and outputs
- const int ni = 16, ci = 64, hi = 32, wi = 32;
- const int no = 16, co = 64, ho = 32, wo = 32;
-
-     // element counts of the tensors
- int conv_input_count = ni * hi * wi * ci;
- int relu_output_count = no * ho * wo * co;
-
- // prepare cpu origin data
- std::vector<float> conv_input_cpu_data;
- gen_rand_data(conv_input_cpu_data, conv_input_count, 256);
- std::vector<float> add_input_cpu_data;
- gen_rand_data(add_input_cpu_data, relu_output_count, 256);
- std::vector<float> relu_output_cpu_data(relu_output_count);
- std::vector<float> add_output_cpu_data(relu_output_count);
-
- auto mlu_deleter = [](void* p) { MGB_CNRT_CHECK(cnrtFree(p)); };
- void* conv_input_mlu_ptr;
- void* add_input_mlu_ptr;
- void* relu_output_mlu_ptr;
- void* add_output_mlu_ptr;
-
- // malloc mlu mem for fusion input and output
- MGB_CNRT_CHECK(cnrtMalloc(&conv_input_mlu_ptr, conv_input_count * dtype_size));
- MGB_CNRT_CHECK(cnrtMalloc(&add_input_mlu_ptr, relu_output_count * sizeof(float)));
- MGB_CNRT_CHECK(cnrtMalloc(&relu_output_mlu_ptr, relu_output_count * sizeof(float)));
- MGB_CNRT_CHECK(cnrtMalloc(&add_output_mlu_ptr, relu_output_count * sizeof(float)));
-
- // memory copy cpu->mlu
- MGB_CNRT_CHECK(cnrtMemcpy(
- conv_input_mlu_ptr, conv_input_cpu_data.data(),
- conv_input_count * dtype_size, CNRT_MEM_TRANS_DIR_HOST2DEV));
- MGB_CNRT_CHECK(cnrtMemcpy(
- add_input_mlu_ptr, add_input_cpu_data.data(),
- relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_HOST2DEV));
- std::unique_ptr<void, decltype(mlu_deleter)> conv_input_holder{
- conv_input_mlu_ptr, mlu_deleter};
- std::unique_ptr<void, decltype(mlu_deleter)> add_input_holder{
- add_input_mlu_ptr, mlu_deleter};
- std::unique_ptr<void, decltype(mlu_deleter)> relu_output_holder{
- relu_output_mlu_ptr, mlu_deleter};
- std::unique_ptr<void, decltype(mlu_deleter)> add_output_holder{
- add_output_mlu_ptr, mlu_deleter};
-
- network.infer_model(
-             {conv_input_mlu_ptr, add_input_mlu_ptr},
- {relu_output_mlu_ptr, add_output_mlu_ptr},
- {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}});
-
-     // result memory copy mlu->cpu
- MGB_CNRT_CHECK(cnrtMemcpy(
- relu_output_cpu_data.data(), relu_output_mlu_ptr,
- relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
- MGB_CNRT_CHECK(cnrtMemcpy(
- add_output_cpu_data.data(), add_output_mlu_ptr,
- relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
-
- auto buf = network.get_serialized_model(false);
- auto x = std::make_shared<HostTensorND>(
- cn, TensorLayout{{ni, ci, hi, wi}, dtype::Float32()});
- auto add = std::make_shared<HostTensorND>(
- cn, TensorLayout{{no, co, ho, wo}, dtype::Float32()});
- std::memcpy(
- reinterpret_cast<void*>(x->ptr<dt_float32>()), conv_input_cpu_data.data(),
- conv_input_count * sizeof(float));
- std::memcpy(
- reinterpret_cast<void*>(add->ptr<dt_float32>()), add_input_cpu_data.data(),
- relu_output_count * sizeof(float));
- auto graph = ComputingGraph::make();
- auto x_ = opr::Host2DeviceCopy::make(*graph, x);
- auto add_ = opr::Host2DeviceCopy::make(*graph, add);
- auto outs = opr::MagicMindRuntimeOpr::make(
- reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
- auto out1 = outs[0];
- auto out2 = outs[1];
- HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32());
- HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32());
- auto func = graph->compile(
- {make_callback_copy(out1, o1), make_callback_copy(out2, o2)});
- func->execute();
- HostTensorND o1_mm(cn, {no, co, ho, wo}, dtype::Float32()),
- o2_mm(cn, {no, co, ho, wo}, dtype::Float32());
- std::memcpy(
- o1_mm.ptr<float>(), relu_output_cpu_data.data(),
- relu_output_count * sizeof(float));
- std::memcpy(
- o2_mm.ptr<float>(), add_output_cpu_data.data(),
- relu_output_count * sizeof(float));
- MGB_ASSERT_TENSOR_NEAR(o1, o1_mm, 1e-4);
- MGB_ASSERT_TENSOR_NEAR(o2, o2_mm, 1e-4);
- }
-
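- // same comparison as Basic, but the model is built with DataType::QINT8 and
- // the graph-side conv input uses dtype::QuantizedS8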
- TEST(TestMagicMindRuntimeOpr, InputQInt8) {
- REQUIRE_CAMBRICON_DEVICE(1);
- auto cn = CompNode::load("cambricon0");
- MMNetwork network(cn, magicmind::DataType::QINT8, false);
- size_t dtype_size = magicmind::DataTypeSize(magicmind::DataType::QINT8);
-
-     // shapes of the network inputs and outputs
- const int ni = 16, ci = 64, hi = 32, wi = 32;
- const int no = 16, co = 64, ho = 32, wo = 32;
-
-     // element counts of the tensors
- int conv_input_count = ni * hi * wi * ci;
- int relu_output_count = no * ho * wo * co;
-
- // prepare cpu origin data
- std::vector<int8_t> conv_input_cpu_data;
- gen_rand_data(conv_input_cpu_data, conv_input_count, 256);
- std::vector<float> add_input_cpu_data;
- gen_rand_data(add_input_cpu_data, relu_output_count, 256);
- std::vector<float> relu_output_cpu_data(relu_output_count);
- std::vector<float> add_output_cpu_data(relu_output_count);
-
- auto mlu_deleter = [](void* p) { MGB_CNRT_CHECK(cnrtFree(p)); };
- void* conv_input_mlu_ptr;
- void* add_input_mlu_ptr;
- void* relu_output_mlu_ptr;
- void* add_output_mlu_ptr;
-
- // malloc mlu mem for fusion input and output
- MGB_CNRT_CHECK(cnrtMalloc(&conv_input_mlu_ptr, conv_input_count * dtype_size));
- MGB_CNRT_CHECK(cnrtMalloc(&add_input_mlu_ptr, relu_output_count * sizeof(float)));
-
- MGB_CNRT_CHECK(cnrtMalloc(&relu_output_mlu_ptr, relu_output_count * sizeof(float)));
- MGB_CNRT_CHECK(cnrtMalloc(&add_output_mlu_ptr, relu_output_count * sizeof(float)));
- // memory copy cpu->mlu
- MGB_CNRT_CHECK(cnrtMemcpy(
- conv_input_mlu_ptr, conv_input_cpu_data.data(),
- conv_input_count * dtype_size, CNRT_MEM_TRANS_DIR_HOST2DEV));
- MGB_CNRT_CHECK(cnrtMemcpy(
- add_input_mlu_ptr, add_input_cpu_data.data(),
- relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_HOST2DEV));
- std::unique_ptr<void, decltype(mlu_deleter)> conv_input_holder{
- conv_input_mlu_ptr, mlu_deleter};
- std::unique_ptr<void, decltype(mlu_deleter)> add_input_holder{
- add_input_mlu_ptr, mlu_deleter};
- std::unique_ptr<void, decltype(mlu_deleter)> relu_output_holder{
- relu_output_mlu_ptr, mlu_deleter};
- std::unique_ptr<void, decltype(mlu_deleter)> add_output_holder{
- add_output_mlu_ptr, mlu_deleter};
-
- network.infer_model(
-             {conv_input_mlu_ptr, add_input_mlu_ptr},
- {relu_output_mlu_ptr, add_output_mlu_ptr},
- {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}});
-
-     // result memory copy mlu->cpu
- MGB_CNRT_CHECK(cnrtMemcpy(
- relu_output_cpu_data.data(), relu_output_mlu_ptr,
- relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
- MGB_CNRT_CHECK(cnrtMemcpy(
- add_output_cpu_data.data(), add_output_mlu_ptr,
- relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
-
- auto buf = network.get_serialized_model(false);
- auto x = std::make_shared<HostTensorND>(
- cn, TensorLayout{{ni, ci, hi, wi}, dtype::QuantizedS8{1.f}});
- auto add = std::make_shared<HostTensorND>(
- cn, TensorLayout{{no, co, ho, wo}, dtype::Float32()});
- std::memcpy(
- reinterpret_cast<void*>(x->raw_ptr()), conv_input_cpu_data.data(),
- conv_input_count * sizeof(int8_t));
- std::memcpy(
- reinterpret_cast<void*>(add->ptr<dt_float32>()), add_input_cpu_data.data(),
- relu_output_count * sizeof(float));
- auto graph = ComputingGraph::make();
- auto x_ = opr::Host2DeviceCopy::make(*graph, x);
- auto add_ = opr::Host2DeviceCopy::make(*graph, add);
- auto outs = opr::MagicMindRuntimeOpr::make(
- reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
- auto out1 = outs[0];
- auto out2 = outs[1];
- HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32());
- HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32());
- auto func = graph->compile(
- {make_callback_copy(out1, o1), make_callback_copy(out2, o2)});
- func->execute();
- HostTensorND o1_mm(cn, {no, co, ho, wo}, dtype::Float32()),
- o2_mm(cn, {no, co, ho, wo}, dtype::Float32());
- std::memcpy(
- o1_mm.ptr<float>(), relu_output_cpu_data.data(),
- relu_output_count * sizeof(float));
- std::memcpy(
- o2_mm.ptr<float>(), add_output_cpu_data.data(),
- relu_output_count * sizeof(float));
- MGB_ASSERT_TENSOR_NEAR(o1, o1_mm, 1e-4);
- MGB_ASSERT_TENSOR_NEAR(o2, o2_mm, 1e-4);
- }
-
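- // build with graph_shape_mutable=true and verify that a single serialized
- // model serves several input shapes, both through the native runtime and
- // through MagicMindRuntimeOpr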
- TEST(TestMagicMindRuntimeOpr, GraphShapeMutable) {
- REQUIRE_CAMBRICON_DEVICE(1);
- auto cn = CompNode::load("cambricon0");
- MMNetwork network(cn, magicmind::DataType::FLOAT32, true);
- size_t dtype_size = magicmind::DataTypeSize(magicmind::DataType::FLOAT32);
-
- auto check = [&](magicmind::Dims input_dim, magicmind::Dims output_dim) {
-         // shapes of the network inputs and outputs
- const int ni = input_dim[0], ci = input_dim[1], hi = input_dim[2],
- wi = input_dim[3];
- const int no = output_dim[0], co = output_dim[1], ho = output_dim[2],
- wo = output_dim[3];
-
-         // element counts of the tensors
- int conv_input_count = ni * hi * wi * ci;
- int relu_output_count = no * ho * wo * co;
-
- // prepare cpu origin data
- std::vector<float> conv_input_cpu_data;
- gen_rand_data(conv_input_cpu_data, conv_input_count, 256);
- std::vector<float> add_input_cpu_data;
- gen_rand_data(add_input_cpu_data, relu_output_count, 256);
- std::vector<float> relu_output_cpu_data(relu_output_count);
- std::vector<float> add_output_cpu_data(relu_output_count);
-
- auto mlu_deleter = [](void* p) { MGB_CNRT_CHECK(cnrtFree(p)); };
- void* conv_input_mlu_ptr;
- void* add_input_mlu_ptr;
- void* relu_output_mlu_ptr;
- void* add_output_mlu_ptr;
-
- // malloc mlu mem for fusion input and output
- MGB_CNRT_CHECK(cnrtMalloc(&conv_input_mlu_ptr, conv_input_count * dtype_size));
- MGB_CNRT_CHECK(
- cnrtMalloc(&add_input_mlu_ptr, relu_output_count * sizeof(float)));
- MGB_CNRT_CHECK(
- cnrtMalloc(&relu_output_mlu_ptr, relu_output_count * sizeof(float)));
- MGB_CNRT_CHECK(
- cnrtMalloc(&add_output_mlu_ptr, relu_output_count * sizeof(float)));
-
- // memory copy cpu->mlu
- MGB_CNRT_CHECK(cnrtMemcpy(
- conv_input_mlu_ptr, conv_input_cpu_data.data(),
- conv_input_count * dtype_size, CNRT_MEM_TRANS_DIR_HOST2DEV));
- MGB_CNRT_CHECK(cnrtMemcpy(
- add_input_mlu_ptr, add_input_cpu_data.data(),
- relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_HOST2DEV));
- std::unique_ptr<void, decltype(mlu_deleter)> conv_input_holder{
- conv_input_mlu_ptr, mlu_deleter};
- std::unique_ptr<void, decltype(mlu_deleter)> add_input_holder{
- add_input_mlu_ptr, mlu_deleter};
- std::unique_ptr<void, decltype(mlu_deleter)> relu_output_holder{
- relu_output_mlu_ptr, mlu_deleter};
- std::unique_ptr<void, decltype(mlu_deleter)> add_output_holder{
- add_output_mlu_ptr, mlu_deleter};
-
- network.infer_model(
-                 {conv_input_mlu_ptr, add_input_mlu_ptr},
- {relu_output_mlu_ptr, add_output_mlu_ptr},
- {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}});
-
-         // result memory copy mlu->cpu
- MGB_CNRT_CHECK(cnrtMemcpy(
- relu_output_cpu_data.data(), relu_output_mlu_ptr,
- relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
- MGB_CNRT_CHECK(cnrtMemcpy(
- add_output_cpu_data.data(), add_output_mlu_ptr,
- relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
-
- auto buf = network.get_serialized_model(true);
- auto mkshp = [](int n, int c, int h, int w) {
- size_t nz = n, cz = c, hz = h, wz = w;
- return TensorShape{nz, cz, hz, wz};
- };
- auto mkly = [](int n, int c, int h, int w, DType dtype) {
- size_t nz = n, cz = c, hz = h, wz = w;
- return TensorLayout{{nz, cz, hz, wz}, dtype};
- };
- auto x = std::make_shared<HostTensorND>(
- cn, mkly(ni, ci, hi, wi, dtype::Float32()));
- auto add = std::make_shared<HostTensorND>(
- cn, mkly(no, co, ho, wo, dtype::Float32()));
- std::memcpy(
- reinterpret_cast<void*>(x->ptr<dt_float32>()),
- conv_input_cpu_data.data(), conv_input_count * sizeof(float));
- std::memcpy(
- reinterpret_cast<void*>(add->ptr<dt_float32>()),
- add_input_cpu_data.data(), relu_output_count * sizeof(float));
- auto graph = ComputingGraph::make();
- auto x_ = opr::Host2DeviceCopy::make(*graph, x);
- auto add_ = opr::Host2DeviceCopy::make(*graph, add);
- auto outs = opr::MagicMindRuntimeOpr::make(
- reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
- auto out1 = outs[0];
- auto out2 = outs[1];
- HostTensorND o1(cn, mkshp(no, co, ho, wo), dtype::Float32());
- HostTensorND o2(cn, mkshp(no, co, ho, wo), dtype::Float32());
- auto func = graph->compile(
- {make_callback_copy(out1, o1), make_callback_copy(out2, o2)});
- func->execute();
- HostTensorND o1_mm(cn, mkshp(no, co, ho, wo), dtype::Float32()),
- o2_mm(cn, mkshp(no, co, ho, wo), dtype::Float32());
- std::memcpy(
- o1_mm.ptr<float>(), relu_output_cpu_data.data(),
- relu_output_count * sizeof(float));
- std::memcpy(
- o2_mm.ptr<float>(), add_output_cpu_data.data(),
- relu_output_count * sizeof(float));
- MGB_ASSERT_TENSOR_NEAR(o1, o1_mm, 1e-4);
- MGB_ASSERT_TENSOR_NEAR(o2, o2_mm, 1e-4);
- };
- check(Dims{{1, 64, 32, 32}}, Dims{{1, 64, 32, 32}});
- check(Dims{{32, 64, 32, 32}}, Dims{{32, 64, 32, 32}});
- check(Dims{{7, 64, 16, 16}}, Dims{{7, 64, 16, 16}});
- }
-
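- // round-trip a graph containing MagicMindRuntimeOpr through GraphDumper /
- // GraphLoader and check that the outputs survive (de)serialization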
- TEST(TestMagicMindRuntimeOpr, Serialization) {
- using namespace serialization;
- REQUIRE_CAMBRICON_DEVICE(1);
- auto cn = CompNode::load("cambricon0");
- MMNetwork network(cn, magicmind::DataType::FLOAT32, true);
- auto buf = network.get_serialized_model(false);
-
-     // shapes of the network inputs and outputs
- const int ni = 1, ci = 64, hi = 32, wi = 32;
- const int no = 1, co = 64, ho = 32, wo = 32;
- auto x = std::make_shared<HostTensorND>(
- cn, TensorLayout{{ni, ci, hi, wi}, dtype::Float32()});
- auto add = std::make_shared<HostTensorND>(
- cn, TensorLayout{{no, co, ho, wo}, dtype::Float32()});
- auto graph = ComputingGraph::make();
- auto x_ = opr::Host2DeviceCopy::make(*graph, x);
- auto add_ = opr::Host2DeviceCopy::make(*graph, add);
- auto outs = opr::MagicMindRuntimeOpr::make(
- reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
- auto out1 = outs[0];
- auto out2 = outs[1];
- auto fname = output_file("MagicMindRuntimeOprTest");
- auto dump = [&]() {
- auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
- auto rst = dumper->dump({out1, out2});
- ASSERT_EQ(rst.outputs.size(), 2u);
- };
- auto load = [&]() {
- auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
- auto rst = loader->load();
- ASSERT_EQ(rst.output_var_list.size(), 2u);
- };
- dump();
- load();
- }
-
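- // execute the runtime opr under GraphProfiler and dump the collected
- // profiling data to a JSON file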
- TEST(TestMagicMindRuntimeOpr, Profiling) {
- REQUIRE_CAMBRICON_DEVICE(1);
- auto cn = CompNode::load("cambricon0");
- MMNetwork network(cn, magicmind::DataType::FLOAT32, true);
- auto buf = network.get_serialized_model(false);
- const int ni = 8, ci = 64, hi = 32, wi = 32;
- const int no = 1, co = 64, ho = 32, wo = 32;
-
- HostTensorGenerator<dtype::Float32, RandomDistribution::GAUSSIAN> gen(0, 1);
- auto x = gen({ni, ci, hi, wi}, cn);
- auto add = gen({no, co, ho, wo}, cn);
-
- auto graph = ComputingGraph::make();
- GraphProfiler profiler{graph.get()};
- auto x_ = opr::Host2DeviceCopy::make(*graph, x);
- auto add_ = opr::Host2DeviceCopy::make(*graph, add);
- auto outs = opr::MagicMindRuntimeOpr::make(
- reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
- auto out1 = outs[0];
- auto out2 = outs[1];
- graph->options().var_sanity_check_first_run = false;
- HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32());
- HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32());
- auto func = graph->compile(
- {make_callback_copy(out1, o1), make_callback_copy(out2, o2)});
- func->execute();
- profiler.to_json_full(func.get())
- ->writeto_fpath(output_file("magicmind_runtime_opr_profile.json"));
- }
-
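- // host the inputs on cpu0 and copy them to the cambricon comp node with
- // opr::Copy before feeding the runtime opr, exercising cross-comp-node copy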
- TEST(TestMagicMindRuntimeOpr, CrossCNCopy) {
- REQUIRE_CAMBRICON_DEVICE(1);
- auto cn = CompNode::load("cambricon0");
- MMNetwork network(cn, magicmind::DataType::FLOAT32, false);
- size_t dtype_size = magicmind::DataTypeSize(magicmind::DataType::FLOAT32);
-
-     // shapes of the network inputs and outputs
- const int ni = 16, ci = 64, hi = 32, wi = 32;
- const int no = 16, co = 64, ho = 32, wo = 32;
-
-     // element counts of the tensors
- int conv_input_count = ni * hi * wi * ci;
- int relu_output_count = no * ho * wo * co;
-
- // prepare cpu origin data
- std::vector<float> conv_input_cpu_data;
- gen_rand_data(conv_input_cpu_data, conv_input_count, 256);
- std::vector<float> add_input_cpu_data;
- gen_rand_data(add_input_cpu_data, relu_output_count, 256);
- std::vector<float> relu_output_cpu_data(relu_output_count);
- std::vector<float> add_output_cpu_data(relu_output_count);
-
- auto mlu_deleter = [](void* p) { MGB_CNRT_CHECK(cnrtFree(p)); };
- void* conv_input_mlu_ptr;
- void* add_input_mlu_ptr;
- void* relu_output_mlu_ptr;
- void* add_output_mlu_ptr;
-
- // malloc mlu mem for fusion input and output
- MGB_CNRT_CHECK(cnrtMalloc(&conv_input_mlu_ptr, conv_input_count * dtype_size));
- MGB_CNRT_CHECK(cnrtMalloc(&add_input_mlu_ptr, relu_output_count * sizeof(float)));
- MGB_CNRT_CHECK(cnrtMalloc(&relu_output_mlu_ptr, relu_output_count * sizeof(float)));
- MGB_CNRT_CHECK(cnrtMalloc(&add_output_mlu_ptr, relu_output_count * sizeof(float)));
-
- // memory copy cpu->mlu
- MGB_CNRT_CHECK(cnrtMemcpy(
- conv_input_mlu_ptr, conv_input_cpu_data.data(),
- conv_input_count * dtype_size, CNRT_MEM_TRANS_DIR_HOST2DEV));
- MGB_CNRT_CHECK(cnrtMemcpy(
- add_input_mlu_ptr, add_input_cpu_data.data(),
- relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_HOST2DEV));
- std::unique_ptr<void, decltype(mlu_deleter)> conv_input_holder{
- conv_input_mlu_ptr, mlu_deleter};
- std::unique_ptr<void, decltype(mlu_deleter)> add_input_holder{
- add_input_mlu_ptr, mlu_deleter};
- std::unique_ptr<void, decltype(mlu_deleter)> relu_output_holder{
- relu_output_mlu_ptr, mlu_deleter};
- std::unique_ptr<void, decltype(mlu_deleter)> add_output_holder{
- add_output_mlu_ptr, mlu_deleter};
-
- network.infer_model(
-             {conv_input_mlu_ptr, add_input_mlu_ptr},
- {relu_output_mlu_ptr, add_output_mlu_ptr},
- {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}});
-
-     // result memory copy mlu->cpu
- MGB_CNRT_CHECK(cnrtMemcpy(
- relu_output_cpu_data.data(), relu_output_mlu_ptr,
- relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
- MGB_CNRT_CHECK(cnrtMemcpy(
- add_output_cpu_data.data(), add_output_mlu_ptr,
- relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
-
- auto cn_cpu = CompNode::load("cpu0");
- auto buf = network.get_serialized_model(false);
- auto x = std::make_shared<HostTensorND>(
- cn_cpu, TensorLayout{{ni, ci, hi, wi}, dtype::Float32()});
- auto add = std::make_shared<HostTensorND>(
- cn_cpu, TensorLayout{{no, co, ho, wo}, dtype::Float32()});
- std::memcpy(
- reinterpret_cast<void*>(x->ptr<dt_float32>()), conv_input_cpu_data.data(),
- conv_input_count * sizeof(float));
- std::memcpy(
- reinterpret_cast<void*>(add->ptr<dt_float32>()), add_input_cpu_data.data(),
- relu_output_count * sizeof(float));
- auto graph = ComputingGraph::make();
- auto x_ = opr::Host2DeviceCopy::make(*graph, x, {cn_cpu});
- auto add_ = opr::Host2DeviceCopy::make(*graph, add, {cn_cpu});
- x_ = opr::Copy::make(x_, {cn});
- add_ = opr::Copy::make(add_, {cn});
- auto outs = opr::MagicMindRuntimeOpr::make(
- reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
- auto out1 = outs[0];
- auto out2 = outs[1];
- HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32());
- HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32());
- auto func = graph->compile(
- {make_callback_copy(out1, o1), make_callback_copy(out2, o2)});
- func->execute();
- HostTensorND o1_mm(cn, {no, co, ho, wo}, dtype::Float32()),
- o2_mm(cn, {no, co, ho, wo}, dtype::Float32());
- std::memcpy(
- o1_mm.ptr<float>(), relu_output_cpu_data.data(),
- relu_output_count * sizeof(float));
- std::memcpy(
- o2_mm.ptr<float>(), add_output_cpu_data.data(),
- relu_output_count * sizeof(float));
- MGB_ASSERT_TENSOR_NEAR(o1, o1_mm, 1e-4);
- MGB_ASSERT_TENSOR_NEAR(o2, o2_mm, 1e-4);
- }
-
- #endif
-
- // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}