diff --git a/src/cambricon/impl/magicmind_runtime_opr.cpp b/src/cambricon/impl/magicmind_runtime_opr.cpp index 9cd91744..c748f11d 100644 --- a/src/cambricon/impl/magicmind_runtime_opr.cpp +++ b/src/cambricon/impl/magicmind_runtime_opr.cpp @@ -10,7 +10,6 @@ */ #include "megbrain/cambricon/magicmind_runtime_opr.h" -#include #include "megbrain/common.h" #include "megbrain/comp_node_env.h" @@ -20,16 +19,6 @@ using namespace mgb; using namespace opr; using namespace magicmind; -#define MM_CHECK(stmt) \ - do { \ - auto ret = (stmt); \ - if (ret != magicmind::Status::OK()) { \ - std::ostringstream msg; \ - msg << ret; \ - mgb_throw(MegBrainError, "mm failure(extra msg:%s)", msg.str().c_str()); \ - } \ - } while (0) - namespace { Dims mgb_shape_to_mm_dims(TensorShape mgb_shp) { size_t ndim = mgb_shp.ndim; @@ -390,7 +379,6 @@ SymbolVarArray MagicMindRuntimeOpr::make( model->DeserializeFromMemory(const_cast(buf), size); return make(std::move(model), std::move(cambricon_allocator), src, config); } -#undef MM_CHECK #endif // MGB_CAMBRICON diff --git a/src/cambricon/impl/magicmind_runtime_opr.sereg.h b/src/cambricon/impl/magicmind_runtime_opr.sereg.h index 6e915029..67b0ab3d 100644 --- a/src/cambricon/impl/magicmind_runtime_opr.sereg.h +++ b/src/cambricon/impl/magicmind_runtime_opr.sereg.h @@ -9,23 +9,12 @@ * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. */ -#include #include "megbrain/cambricon/magicmind_runtime_opr.h" #include "megbrain/serialization/sereg.h" namespace mgb { namespace serialization { -#define MM_CHECK(stmt) \ - do { \ - auto ret = (stmt); \ - if (ret != magicmind::Status::OK()) { \ - std::ostringstream msg; \ - msg << ret; \ - mgb_throw(MegBrainError, "mm failure(extra msg:%s)", msg.str().c_str()); \ - } \ - } while (0) - template <> struct OprLoadDumpImpl { static void dump(OprDumpContext& ctx, const cg::OperatorNodeBase& opr_) { @@ -70,7 +59,6 @@ cg::OperatorNodeBase* opr_shallow_copy_magicmind_runtime_opr( MGB_SEREG_OPR(MagicMindRuntimeOpr, 0); MGB_REG_OPR_SHALLOW_COPY(MagicMindRuntimeOpr, opr_shallow_copy_magicmind_runtime_opr); -#undef MM_CHECK } // namespace opr } // namespace mgb diff --git a/src/cambricon/include/megbrain/cambricon/magicmind_runtime_opr.h b/src/cambricon/include/megbrain/cambricon/magicmind_runtime_opr.h index c2f150a7..0695099b 100644 --- a/src/cambricon/include/megbrain/cambricon/magicmind_runtime_opr.h +++ b/src/cambricon/include/megbrain/cambricon/magicmind_runtime_opr.h @@ -16,8 +16,19 @@ #if MGB_CAMBRICON +#include #include "interface_runtime.h" +#define MM_CHECK(stmt) \ + do { \ + auto ret = (stmt); \ + if (ret != magicmind::Status::OK()) { \ + std::ostringstream msg; \ + msg << ret; \ + mgb_throw(MegBrainError, "mm failure(extra msg:%s)", msg.str().c_str()); \ + } \ + } while (0) + namespace mgb { namespace opr { namespace magicmind_intl { diff --git a/src/cambricon/test/magicmind_runtime_opr.cpp b/src/cambricon/test/magicmind_runtime_opr.cpp new file mode 100644 index 00000000..b6f64614 --- /dev/null +++ b/src/cambricon/test/magicmind_runtime_opr.cpp @@ -0,0 +1,824 @@ +/** + * \file src/cambricon/test/magicmind_runtime_opr.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ */ + +#include "megbrain/comp_node_env.h" +#include "megbrain/opr/basic_arith.h" +#include "megbrain/opr/io.h" +#include "megbrain/plugin/profiler.h" +#include "megbrain/serialization/serializer.h" +#include "megbrain/test/helper.h" + +#if MGB_CAMBRICON + +#include "megbrain/cambricon/magicmind_runtime_opr.h" + +#include "interface_builder.h" +#include "interface_network.h" + +using namespace mgb; +using namespace opr; +using namespace magicmind; + +namespace { +template +void gen_rand_data(std::vector& data, size_t num_elems, size_t scale) { + unsigned int seed = time(0); + data.resize(num_elems); + for (size_t i = 0; i < num_elems; ++i) { + data[i] = + static_cast((rand_r(&seed) % (scale * 1000)) / 1000.0 - scale / 2.0); + } +} + +template +void get_min_max(std::vector& data, double& min, double& max) { + min = *std::min_element(data.begin(), data.end()); + max = *std::max_element(data.begin(), data.end()); +} + +void cast_data_type( + std::vector& input, void* output, size_t size, cnrtDataType_t input_type, + cnrtDataType_t output_type, double& min, double& max) { + cnrtQuantizedParam_t param = NULL; + if (output_type == CNRT_INT8 || output_type == CNRT_INT16) { + get_min_max(input, min, max); + int bitwidth = 8; + if (output_type == CNRT_INT8) { + bitwidth = 8; + } else if (output_type == CNRT_INT16) { + bitwidth = 16; + } + auto par_tmp = magicmind::RangeToUniformQuantParamWithQuantAlg( + {min, max}, bitwidth, "symmetric"); + auto par = magicmind::UniformToNormalCast(par_tmp); + MGB_CNRT_CHECK(cnrtCreateQuantizedParam(¶m, par.pos, par.scale, 0)); + } + MGB_CNRT_CHECK(cnrtCastDataType( + reinterpret_cast(input.data()), input_type, output, output_type, + size, param)); +} +cnrtDataType_t convert_data_type(magicmind::DataType dtype) { + static const std::unordered_map dtype_map = { +#define cb(dt_mm_, dt_cnrt_) {magicmind::DataType::dt_mm_, CNRT_##dt_cnrt_} + cb(QINT8, INT8), cb(QINT16, INT16), cb(INT8, INT8), + cb(INT16, INT16), cb(INT32, INT32), cb(UINT8, UINT8), + cb(FLOAT16, FLOAT16), cb(FLOAT32, FLOAT32), + }; + auto it = dtype_map.find(dtype); + if (it != dtype_map.end()) + return it->second; + else { + mgb_assert( + false, "unsupported magicmind dtype(%u).", + static_cast(dtype)); + } +} + +///! 
taken from src/jit/impl/utils.cpp +void replace_all_pairs_inplace( + std::string& text, + const std::vector>& replace) { + using str = std::string; + auto repl_one = [&text](const str& from, const str& to) { + mgb_assert(!from.empty()); + size_t pos = 0; + while ((pos = text.find(from, pos)) != str::npos) { + text.replace(pos, from.size(), to); + pos += to.size(); + } + }; + for (auto&& i : replace) { + repl_one(i.first, i.second); + } +} + +class MMNetwork { +public: + template + using MagicMindUniquePtr = magicmind_intl::MagicMindUniquePtr; + using IModelPtr = MagicMindRuntimeOpr::IModelPtr; + using IContextPtr = MagicMindRuntimeOpr::IContextPtr; + using IEnginePtr = MagicMindRuntimeOpr::IEnginePtr; + + const CompNode& cn_; + magicmind::DataType op_datatype_; + IModelPtr model_; + bool graph_shape_mutable_; + bool built_; + + template + static MagicMindUniquePtr make_mm_unique_ptr(T* ptr) { + return {ptr, magicmind_intl::MagicMindDeleter()}; + } + + MMNetwork( + const CompNode& cn, magicmind::DataType op_datatype, + bool graph_shape_mutable = false) + : cn_{cn}, + op_datatype_{op_datatype}, + model_{nullptr}, + graph_shape_mutable_{graph_shape_mutable}, + built_{false} {} + void build() { + auto&& cnrt_env = CompNodeEnv::from_comp_node(cn_).cnrt_env(); + cnrt_env.activate(); + constexpr int ni = 16, ci = 64, hi = 32, wi = 32; + constexpr int no = 16, co = 64, ho = 32, wo = 32; + constexpr int kh = 3, kw = 3; + constexpr int stride_h = 1, stride_w = 1; + constexpr int pad_h = 1, pad_w = 1; + magicmind::Dims input_dim{{ni, ci, hi, wi}}; + magicmind::Dims filter_dim{{co, ci, kh, kw}}; + magicmind::Dims bias_dim{{co}}; + magicmind::Dims add_dim{{no, co, ho, wo}}; + magicmind::DataType output_datatype = magicmind::DataType::FLOAT32; + + // init + auto builder = make_mm_unique_ptr(magicmind::CreateIBuilder()); + auto config = make_mm_unique_ptr(magicmind::CreateIBuilderConfig()); + std::string user_json_config = R"( +{ + "graph_shape_mutable": {{GRAPH_SHAPE_MUTABLE}}, + "precision_config": { + "precision_mode": "qint8_mixed_float16" + } +} +)"; + replace_all_pairs_inplace( + user_json_config, + {{"{{GRAPH_SHAPE_MUTABLE}}", std::to_string(graph_shape_mutable_)}}); + config->ParseFromString(user_json_config); + auto network = make_mm_unique_ptr(magicmind::CreateINetwork()); + magicmind::Range filter_range = {0.0f, 0.0f}; + // create input tensor + auto init_tensor = [](magicmind::ITensor* tensor, const std::string& name, + const Dims& input_dim) { + magicmind::Range input_range = {0.0f, 0.0f}; + std::vector temp_buffer; + gen_rand_data(temp_buffer, input_dim.GetElementCount(), 256); + get_min_max(temp_buffer, input_range.min, input_range.max); + MM_CHECK(tensor->SetDynamicRange(input_range, false)); + tensor->SetTensorName(name); + }; + auto input_tensor = network->AddInput(op_datatype_, input_dim); + init_tensor(input_tensor, "x", input_dim); + auto add_tensor = network->AddInput(output_datatype, add_dim); + init_tensor(add_tensor, "add", add_dim); + // create filter tensor + magicmind::ITensor* filter_tensor = nullptr; + { + std::vector filter_buf; + gen_rand_data(filter_buf, filter_dim.GetElementCount(), 1); + std::vector filter_buf_intx; + filter_buf_intx.resize( + filter_dim.GetElementCount() * + magicmind::DataTypeSize(op_datatype_)); + cast_data_type( + filter_buf, reinterpret_cast(filter_buf_intx.data()), + filter_dim.GetElementCount(), CNRT_FLOAT32, + convert_data_type(op_datatype_), filter_range.min, + filter_range.max); + auto filter = network->AddIConstNode( + op_datatype_, 
filter_dim, + reinterpret_cast(filter_buf_intx.data())); + filter_tensor = filter->GetOutput(0); + filter_tensor->SetDynamicRange(filter_range, false); + } + + // create bias tensor + magicmind::ITensor* bias_tensor = nullptr; + { + std::vector bias_buf; + gen_rand_data(bias_buf, bias_dim.GetElementCount(), 1); + std::vector bias_buf_floatx; + if (output_datatype == magicmind::DataType::FLOAT16) { + bias_buf_floatx.resize( + bias_dim.GetElementCount() * + magicmind::DataTypeSize(output_datatype)); + double min = 0., max = 0.; + cast_data_type( + bias_buf, reinterpret_cast(bias_buf_floatx.data()), + bias_dim.GetElementCount(), CNRT_FLOAT32, + convert_data_type(output_datatype), min, max); + auto bias = network->AddIConstNode( + output_datatype, bias_dim, + reinterpret_cast(bias_buf_floatx.data())); + bias_tensor = bias->GetOutput(0); + } else { + auto bias = network->AddIConstNode( + output_datatype, bias_dim, + reinterpret_cast(bias_buf.data())); + bias_tensor = bias->GetOutput(0); + } + } + + // x w bias + // \ / | + // | / + // conv + // | + // relu ------ out1 + // \ add + // \ / + // | + // out2 + + // create conv + relu node + auto conv = network->AddIConvNode(input_tensor, filter_tensor, bias_tensor); + MM_CHECK(conv->SetStride(stride_h, stride_w)); + MM_CHECK(conv->SetPad(pad_h, pad_w, pad_h, pad_w)); + MM_CHECK(conv->SetDilation(1, 1)); + MM_CHECK(conv->SetPaddingMode(magicmind::IPaddingMode::EXPLICIT)); + auto conv_output = conv->GetOutput(0); + // conv output tensor datatype should be set same with bias tensor + MM_CHECK(conv->SetOutputType(0, output_datatype)); + // relu output tensor datatype will be same with input tensor + auto relu = + network->AddIActivationNode(conv_output, magicmind::IActivation::RELU); + MM_CHECK(relu->SetOutputType(0, output_datatype)); + relu->GetOutput(0)->SetTensorName("out1"); + + // set outputs nodes + MM_CHECK(network->MarkOutput(relu->GetOutput(0))); + + // create elemwise add + auto add = network->AddIElementwiseNode( + relu->GetOutput(0), add_tensor, magicmind::IElementwise::ADD); + add->GetOutput(0)->SetTensorName("out2"); + MM_CHECK(network->MarkOutput(add->GetOutput(0))); + + // create model + model_ = { + builder->BuildModel("model", network.get(), config.get()), + magicmind_intl::MagicMindDeleter()}; + mgb_assert(model_ != nullptr); + + built_ = true; + } + + const IModelPtr& get_inference_model() { + if (!built_) + build(); + return model_; + } + + std::string get_serialized_model(bool serialize_to_file) { + if (!built_) + build(); + size_t size = 0; + MM_CHECK(model_->GetSerializedModelSize(&size)); + std::string buf; + buf.resize(size); + MM_CHECK(model_->SerializeToMemory(reinterpret_cast(buf.data()), size)); + if (serialize_to_file) { + std::string fname = ssprintf( + "./output/MagicMindRuntimeOprTest.%s.mlu", + graph_shape_mutable_ ? 
"GraphShapeMutable" + : "GraphShapeImmutableBatch"); + model_->SerializeToFile(fname.c_str()); + } + return buf; + } + + void infer_model( + const std::vector& inputs, const std::vector& outputs, + const std::vector& input_dims) { + if (!built_) + build(); + auto&& cnrt_env = CompNodeEnv::from_comp_node(cn_).cnrt_env(); + cnrt_env.activate(); + auto engine = make_mm_unique_ptr(model_->CreateIEngine()); + mgb_assert(engine != nullptr); + auto context = make_mm_unique_ptr(engine->CreateIContext()); + mgb_assert(context != nullptr); + + // create and get irttensor from context + std::vector input_tensors; + std::vector output_tensors; + MM_CHECK(CreateInputTensors(context.get(), &input_tensors)); + MM_CHECK(CreateOutputTensors(context.get(), &output_tensors)); + MM_CHECK(FindIRTTensorByName(input_tensors, "x")->SetDimensions(input_dims[0])); + MM_CHECK(FindIRTTensorByName(input_tensors, "add") + ->SetDimensions(input_dims[1])); + MM_CHECK(context->InferOutputShape(input_tensors, output_tensors)); + MM_CHECK(FindIRTTensorByName(input_tensors, "x")->SetData(inputs[0])); + MM_CHECK(FindIRTTensorByName(input_tensors, "add")->SetData(inputs[1])); + MM_CHECK(FindIRTTensorByName(output_tensors, "out1")->SetData(outputs[0])); + MM_CHECK(FindIRTTensorByName(output_tensors, "out2")->SetData(outputs[1])); + + auto&& queue = cnrt_env.queue; + cnrtNotifier_t start, end; + MGB_CNRT_CHECK(cnrtCreateNotifier(&start)); + MGB_CNRT_CHECK(cnrtCreateNotifier(&end)); + MGB_CNRT_CHECK(cnrtPlaceNotifier(start, queue)); + + constexpr size_t runs = 50; + for (size_t i = 0; i < runs; ++i) { + MM_CHECK(context->Enqueue(input_tensors, output_tensors, queue)); + } + + MGB_CNRT_CHECK(cnrtPlaceNotifier(end, queue)); + MGB_CNRT_CHECK(cnrtSyncQueue(queue)); + float time = 0.f; + MGB_CNRT_CHECK(cnrtNotifierDuration(start, end, &time)); + printf("inference time = %.2fs\n", time / static_cast(runs) * 1e-3); + MGB_CNRT_CHECK(cnrtDestroyNotifier(&start)); + MGB_CNRT_CHECK(cnrtDestroyNotifier(&end)); + } +}; +} // namespace + +TEST(TestMagicMindRuntimeOpr, Basic) { + REQUIRE_CAMBRICON_DEVICE(1); + auto cn = CompNode::load("cambricon0"); + MMNetwork network(cn, magicmind::DataType::FLOAT32, false); + size_t dtype_size = magicmind::DataTypeSize(magicmind::DataType::FLOAT32); + + // prepare parameter for addpad and conv + const int ni = 16, ci = 64, hi = 32, wi = 32; + const int no = 16, co = 64, ho = 32, wo = 32; + + // count tensor nums + int conv_input_count = ni * hi * wi * ci; + int relu_output_count = no * ho * wo * co; + + // prepare cpu origin data + std::vector conv_input_cpu_data; + gen_rand_data(conv_input_cpu_data, conv_input_count, 256); + std::vector add_input_cpu_data; + gen_rand_data(add_input_cpu_data, relu_output_count, 256); + std::vector relu_output_cpu_data(relu_output_count); + std::vector add_output_cpu_data(relu_output_count); + + auto mlu_deleter = [](void* p) { MGB_CNRT_CHECK(cnrtFree(p)); }; + void* conv_input_mlu_ptr; + void* add_input_mlu_ptr; + void* relu_output_mlu_ptr; + void* add_output_mlu_ptr; + + // malloc mlu mem for fusion input and output + MGB_CNRT_CHECK(cnrtMalloc(&conv_input_mlu_ptr, conv_input_count * dtype_size)); + MGB_CNRT_CHECK(cnrtMalloc(&add_input_mlu_ptr, relu_output_count * sizeof(float))); + MGB_CNRT_CHECK(cnrtMalloc(&relu_output_mlu_ptr, relu_output_count * sizeof(float))); + MGB_CNRT_CHECK(cnrtMalloc(&add_output_mlu_ptr, relu_output_count * sizeof(float))); + + // memory copy cpu->mlu + MGB_CNRT_CHECK(cnrtMemcpy( + conv_input_mlu_ptr, conv_input_cpu_data.data(), + conv_input_count 
* dtype_size, CNRT_MEM_TRANS_DIR_HOST2DEV)); + MGB_CNRT_CHECK(cnrtMemcpy( + add_input_mlu_ptr, add_input_cpu_data.data(), + relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_HOST2DEV)); + std::unique_ptr conv_input_holder{ + conv_input_mlu_ptr, mlu_deleter}; + std::unique_ptr add_input_holder{ + add_input_mlu_ptr, mlu_deleter}; + std::unique_ptr relu_output_holder{ + relu_output_mlu_ptr, mlu_deleter}; + std::unique_ptr add_output_holder{ + add_output_mlu_ptr, mlu_deleter}; + + network.infer_model( + {conv_input_mlu_ptr, add_output_mlu_ptr}, + {relu_output_mlu_ptr, add_output_mlu_ptr}, + {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}}); + + // result memory copy cnml->cpu + // memory copy cpu->mlu + MGB_CNRT_CHECK(cnrtMemcpy( + relu_output_cpu_data.data(), relu_output_mlu_ptr, + relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST)); + MGB_CNRT_CHECK(cnrtMemcpy( + add_output_cpu_data.data(), add_output_mlu_ptr, + relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST)); + + auto buf = network.get_serialized_model(false); + auto x = std::make_shared( + cn, TensorLayout{{ni, ci, hi, wi}, dtype::Float32()}); + auto add = std::make_shared( + cn, TensorLayout{{no, co, ho, wo}, dtype::Float32()}); + std::memcpy( + reinterpret_cast(x->ptr()), conv_input_cpu_data.data(), + conv_input_count * sizeof(float)); + std::memcpy( + reinterpret_cast(add->ptr()), add_input_cpu_data.data(), + relu_output_count * sizeof(float)); + auto graph = ComputingGraph::make(); + auto x_ = opr::Host2DeviceCopy::make(*graph, x); + auto add_ = opr::Host2DeviceCopy::make(*graph, add); + auto outs = opr::MagicMindRuntimeOpr::make( + reinterpret_cast(buf.data()), buf.size(), {x_, add_}); + auto out1 = outs[0]; + auto out2 = outs[1]; + HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32()); + HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32()); + auto func = graph->compile( + {make_callback_copy(out1, o1), make_callback_copy(out2, o2)}); + func->execute(); + HostTensorND o1_mm(cn, {no, co, ho, wo}, dtype::Float32()), + o2_mm(cn, {no, co, ho, wo}, dtype::Float32()); + std::memcpy( + o1_mm.ptr(), relu_output_cpu_data.data(), + relu_output_count * sizeof(float)); + std::memcpy( + o2_mm.ptr(), add_output_cpu_data.data(), + relu_output_count * sizeof(float)); + MGB_ASSERT_TENSOR_NEAR(o1, o1_mm, 1e-4); + MGB_ASSERT_TENSOR_NEAR(o2, o2_mm, 1e-4); +} + +TEST(TestMagicMindRuntimeOpr, InputQInt8) { + REQUIRE_CAMBRICON_DEVICE(1); + auto cn = CompNode::load("cambricon0"); + MMNetwork network(cn, magicmind::DataType::QINT8, false); + size_t dtype_size = magicmind::DataTypeSize(magicmind::DataType::QINT8); + + // prepare parameter for addpad and conv + const int ni = 16, ci = 64, hi = 32, wi = 32; + const int no = 16, co = 64, ho = 32, wo = 32; + + // count tensor nums + int conv_input_count = ni * hi * wi * ci; + int relu_output_count = no * ho * wo * co; + + // prepare cpu origin data + std::vector conv_input_cpu_data; + gen_rand_data(conv_input_cpu_data, conv_input_count, 256); + std::vector add_input_cpu_data; + gen_rand_data(add_input_cpu_data, relu_output_count, 256); + std::vector relu_output_cpu_data(relu_output_count); + std::vector add_output_cpu_data(relu_output_count); + + auto mlu_deleter = [](void* p) { MGB_CNRT_CHECK(cnrtFree(p)); }; + void* conv_input_mlu_ptr; + void* add_input_mlu_ptr; + void* relu_output_mlu_ptr; + void* add_output_mlu_ptr; + + // malloc mlu mem for fusion input and output + MGB_CNRT_CHECK(cnrtMalloc(&conv_input_mlu_ptr, conv_input_count * dtype_size)); + 
MGB_CNRT_CHECK(cnrtMalloc(&add_input_mlu_ptr, relu_output_count * sizeof(float))); + + MGB_CNRT_CHECK(cnrtMalloc(&relu_output_mlu_ptr, relu_output_count * sizeof(float))); + MGB_CNRT_CHECK(cnrtMalloc(&add_output_mlu_ptr, relu_output_count * sizeof(float))); + // memory copy cpu->mlu + MGB_CNRT_CHECK(cnrtMemcpy( + conv_input_mlu_ptr, conv_input_cpu_data.data(), + conv_input_count * dtype_size, CNRT_MEM_TRANS_DIR_HOST2DEV)); + MGB_CNRT_CHECK(cnrtMemcpy( + add_input_mlu_ptr, add_input_cpu_data.data(), + relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_HOST2DEV)); + std::unique_ptr conv_input_holder{ + conv_input_mlu_ptr, mlu_deleter}; + std::unique_ptr add_input_holder{ + add_input_mlu_ptr, mlu_deleter}; + std::unique_ptr relu_output_holder{ + relu_output_mlu_ptr, mlu_deleter}; + std::unique_ptr add_output_holder{ + add_output_mlu_ptr, mlu_deleter}; + + network.infer_model( + {conv_input_mlu_ptr, add_output_mlu_ptr}, + {relu_output_mlu_ptr, add_output_mlu_ptr}, + {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}}); + + // result memory copy cnml->cpu + // memory copy cpu->mlu + MGB_CNRT_CHECK(cnrtMemcpy( + relu_output_cpu_data.data(), relu_output_mlu_ptr, + relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST)); + MGB_CNRT_CHECK(cnrtMemcpy( + add_output_cpu_data.data(), add_output_mlu_ptr, + relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST)); + + auto buf = network.get_serialized_model(false); + auto x = std::make_shared( + cn, TensorLayout{{ni, ci, hi, wi}, dtype::QuantizedS8{1.f}}); + auto add = std::make_shared( + cn, TensorLayout{{no, co, ho, wo}, dtype::Float32()}); + std::memcpy( + reinterpret_cast(x->raw_ptr()), conv_input_cpu_data.data(), + conv_input_count * sizeof(int8_t)); + std::memcpy( + reinterpret_cast(add->ptr()), add_input_cpu_data.data(), + relu_output_count * sizeof(float)); + auto graph = ComputingGraph::make(); + auto x_ = opr::Host2DeviceCopy::make(*graph, x); + auto add_ = opr::Host2DeviceCopy::make(*graph, add); + auto outs = opr::MagicMindRuntimeOpr::make( + reinterpret_cast(buf.data()), buf.size(), {x_, add_}); + auto out1 = outs[0]; + auto out2 = outs[1]; + HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32()); + HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32()); + auto func = graph->compile( + {make_callback_copy(out1, o1), make_callback_copy(out2, o2)}); + func->execute(); + HostTensorND o1_mm(cn, {no, co, ho, wo}, dtype::Float32()), + o2_mm(cn, {no, co, ho, wo}, dtype::Float32()); + std::memcpy( + o1_mm.ptr(), relu_output_cpu_data.data(), + relu_output_count * sizeof(float)); + std::memcpy( + o2_mm.ptr(), add_output_cpu_data.data(), + relu_output_count * sizeof(float)); + MGB_ASSERT_TENSOR_NEAR(o1, o1_mm, 1e-4); + MGB_ASSERT_TENSOR_NEAR(o2, o2_mm, 1e-4); +} + +TEST(TestMagicMindRuntimeOpr, GraphShapeMutable) { + REQUIRE_CAMBRICON_DEVICE(1); + auto cn = CompNode::load("cambricon0"); + MMNetwork network(cn, magicmind::DataType::FLOAT32, true); + size_t dtype_size = magicmind::DataTypeSize(magicmind::DataType::FLOAT32); + + auto check = [&](magicmind::Dims input_dim, magicmind::Dims output_dim) { + // prepare parameter for addpad and conv + const int ni = input_dim[0], ci = input_dim[1], hi = input_dim[2], + wi = input_dim[3]; + const int no = output_dim[0], co = output_dim[1], ho = output_dim[2], + wo = output_dim[3]; + + // count tensor nums + int conv_input_count = ni * hi * wi * ci; + int relu_output_count = no * ho * wo * co; + + // prepare cpu origin data + std::vector conv_input_cpu_data; + gen_rand_data(conv_input_cpu_data, 
conv_input_count, 256); + std::vector add_input_cpu_data; + gen_rand_data(add_input_cpu_data, relu_output_count, 256); + std::vector relu_output_cpu_data(relu_output_count); + std::vector add_output_cpu_data(relu_output_count); + + auto mlu_deleter = [](void* p) { MGB_CNRT_CHECK(cnrtFree(p)); }; + void* conv_input_mlu_ptr; + void* add_input_mlu_ptr; + void* relu_output_mlu_ptr; + void* add_output_mlu_ptr; + + // malloc mlu mem for fusion input and output + MGB_CNRT_CHECK(cnrtMalloc(&conv_input_mlu_ptr, conv_input_count * dtype_size)); + MGB_CNRT_CHECK( + cnrtMalloc(&add_input_mlu_ptr, relu_output_count * sizeof(float))); + MGB_CNRT_CHECK( + cnrtMalloc(&relu_output_mlu_ptr, relu_output_count * sizeof(float))); + MGB_CNRT_CHECK( + cnrtMalloc(&add_output_mlu_ptr, relu_output_count * sizeof(float))); + + // memory copy cpu->mlu + MGB_CNRT_CHECK(cnrtMemcpy( + conv_input_mlu_ptr, conv_input_cpu_data.data(), + conv_input_count * dtype_size, CNRT_MEM_TRANS_DIR_HOST2DEV)); + MGB_CNRT_CHECK(cnrtMemcpy( + add_input_mlu_ptr, add_input_cpu_data.data(), + relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_HOST2DEV)); + std::unique_ptr conv_input_holder{ + conv_input_mlu_ptr, mlu_deleter}; + std::unique_ptr add_input_holder{ + add_input_mlu_ptr, mlu_deleter}; + std::unique_ptr relu_output_holder{ + relu_output_mlu_ptr, mlu_deleter}; + std::unique_ptr add_output_holder{ + add_output_mlu_ptr, mlu_deleter}; + + network.infer_model( + {conv_input_mlu_ptr, add_output_mlu_ptr}, + {relu_output_mlu_ptr, add_output_mlu_ptr}, + {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}}); + + // result memory copy cnml->cpu + // memory copy cpu->mlu + MGB_CNRT_CHECK(cnrtMemcpy( + relu_output_cpu_data.data(), relu_output_mlu_ptr, + relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST)); + MGB_CNRT_CHECK(cnrtMemcpy( + add_output_cpu_data.data(), add_output_mlu_ptr, + relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST)); + + auto buf = network.get_serialized_model(true); + auto mkshp = [](int n, int c, int h, int w) { + size_t nz = n, cz = c, hz = h, wz = w; + return TensorShape{nz, cz, hz, wz}; + }; + auto mkly = [](int n, int c, int h, int w, DType dtype) { + size_t nz = n, cz = c, hz = h, wz = w; + return TensorLayout{{nz, cz, hz, wz}, dtype}; + }; + auto x = std::make_shared( + cn, mkly(ni, ci, hi, wi, dtype::Float32())); + auto add = std::make_shared( + cn, mkly(no, co, ho, wo, dtype::Float32())); + std::memcpy( + reinterpret_cast(x->ptr()), + conv_input_cpu_data.data(), conv_input_count * sizeof(float)); + std::memcpy( + reinterpret_cast(add->ptr()), + add_input_cpu_data.data(), relu_output_count * sizeof(float)); + auto graph = ComputingGraph::make(); + auto x_ = opr::Host2DeviceCopy::make(*graph, x); + auto add_ = opr::Host2DeviceCopy::make(*graph, add); + auto outs = opr::MagicMindRuntimeOpr::make( + reinterpret_cast(buf.data()), buf.size(), {x_, add_}); + auto out1 = outs[0]; + auto out2 = outs[1]; + HostTensorND o1(cn, mkshp(no, co, ho, wo), dtype::Float32()); + HostTensorND o2(cn, mkshp(no, co, ho, wo), dtype::Float32()); + auto func = graph->compile( + {make_callback_copy(out1, o1), make_callback_copy(out2, o2)}); + func->execute(); + HostTensorND o1_mm(cn, mkshp(no, co, ho, wo), dtype::Float32()), + o2_mm(cn, mkshp(no, co, ho, wo), dtype::Float32()); + std::memcpy( + o1_mm.ptr(), relu_output_cpu_data.data(), + relu_output_count * sizeof(float)); + std::memcpy( + o2_mm.ptr(), add_output_cpu_data.data(), + relu_output_count * sizeof(float)); + MGB_ASSERT_TENSOR_NEAR(o1, o1_mm, 1e-4); + 
MGB_ASSERT_TENSOR_NEAR(o2, o2_mm, 1e-4); + }; + check(Dims{{1, 64, 32, 32}}, Dims{{1, 64, 32, 32}}); + check(Dims{{32, 64, 32, 32}}, Dims{{32, 64, 32, 32}}); + check(Dims{{7, 64, 16, 16}}, Dims{{7, 64, 16, 16}}); +} + +TEST(TestMagicMindRuntimeOpr, Serialization) { + using namespace serialization; + REQUIRE_CAMBRICON_DEVICE(1); + auto cn = CompNode::load("cambricon0"); + MMNetwork network(cn, magicmind::DataType::FLOAT32, true); + auto buf = network.get_serialized_model(false); + + // prepare parameter for addpad and conv + const int ni = 1, ci = 64, hi = 32, wi = 32; + const int no = 1, co = 64, ho = 32, wo = 32; + auto x = std::make_shared( + cn, TensorLayout{{ni, ci, hi, wi}, dtype::Float32()}); + auto add = std::make_shared( + cn, TensorLayout{{no, co, ho, wo}, dtype::Float32()}); + auto graph = ComputingGraph::make(); + auto x_ = opr::Host2DeviceCopy::make(*graph, x); + auto add_ = opr::Host2DeviceCopy::make(*graph, add); + auto outs = opr::MagicMindRuntimeOpr::make( + reinterpret_cast(buf.data()), buf.size(), {x_, add_}); + auto out1 = outs[0]; + auto out2 = outs[1]; + auto fname = output_file("MagicMindRuntimeOprTest"); + auto dump = [&]() { + auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str())); + auto rst = dumper->dump({out1, out2}); + ASSERT_EQ(rst.outputs.size(), 2u); + }; + auto load = [&]() { + auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str())); + auto rst = loader->load(); + ASSERT_EQ(rst.output_var_list.size(), 2u); + }; + dump(); + load(); +} + +TEST(TestMagicMindRuntimeOpr, Profiling) { + REQUIRE_CAMBRICON_DEVICE(1); + auto cn = CompNode::load("cambricon0"); + MMNetwork network(cn, magicmind::DataType::FLOAT32, true); + auto buf = network.get_serialized_model(false); + const int ni = 8, ci = 64, hi = 32, wi = 32; + const int no = 1, co = 64, ho = 32, wo = 32; + + HostTensorGenerator gen(0, 1); + auto x = gen({ni, ci, hi, wi}, cn); + auto add = gen({no, co, ho, wo}, cn); + + auto graph = ComputingGraph::make(); + GraphProfiler profiler{graph.get()}; + auto x_ = opr::Host2DeviceCopy::make(*graph, x); + auto add_ = opr::Host2DeviceCopy::make(*graph, add); + auto outs = opr::MagicMindRuntimeOpr::make( + reinterpret_cast(buf.data()), buf.size(), {x_, add_}); + auto out1 = outs[0]; + auto out2 = outs[1]; + graph->options().var_sanity_check_first_run = false; + HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32()); + HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32()); + auto func = graph->compile( + {make_callback_copy(out1, o1), make_callback_copy(out2, o2)}); + func->execute(); + profiler.to_json_full(func.get()) + ->writeto_fpath(output_file("magicmind_runtime_opr_profile.json")); +} + +TEST(TestMagicMindRuntimeOpr, CrossCNCopy) { + REQUIRE_CAMBRICON_DEVICE(1); + auto cn = CompNode::load("cambricon0"); + MMNetwork network(cn, magicmind::DataType::FLOAT32, false); + size_t dtype_size = magicmind::DataTypeSize(magicmind::DataType::FLOAT32); + + // prepare parameter for addpad and conv + const int ni = 16, ci = 64, hi = 32, wi = 32; + const int no = 16, co = 64, ho = 32, wo = 32; + + // count tensor nums + int conv_input_count = ni * hi * wi * ci; + int relu_output_count = no * ho * wo * co; + + // prepare cpu origin data + std::vector conv_input_cpu_data; + gen_rand_data(conv_input_cpu_data, conv_input_count, 256); + std::vector add_input_cpu_data; + gen_rand_data(add_input_cpu_data, relu_output_count, 256); + std::vector relu_output_cpu_data(relu_output_count); + std::vector add_output_cpu_data(relu_output_count); + + auto mlu_deleter = 
[](void* p) { MGB_CNRT_CHECK(cnrtFree(p)); }; + void* conv_input_mlu_ptr; + void* add_input_mlu_ptr; + void* relu_output_mlu_ptr; + void* add_output_mlu_ptr; + + // malloc mlu mem for fusion input and output + MGB_CNRT_CHECK(cnrtMalloc(&conv_input_mlu_ptr, conv_input_count * dtype_size)); + MGB_CNRT_CHECK(cnrtMalloc(&add_input_mlu_ptr, relu_output_count * sizeof(float))); + MGB_CNRT_CHECK(cnrtMalloc(&relu_output_mlu_ptr, relu_output_count * sizeof(float))); + MGB_CNRT_CHECK(cnrtMalloc(&add_output_mlu_ptr, relu_output_count * sizeof(float))); + + // memory copy cpu->mlu + MGB_CNRT_CHECK(cnrtMemcpy( + conv_input_mlu_ptr, conv_input_cpu_data.data(), + conv_input_count * dtype_size, CNRT_MEM_TRANS_DIR_HOST2DEV)); + MGB_CNRT_CHECK(cnrtMemcpy( + add_input_mlu_ptr, add_input_cpu_data.data(), + relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_HOST2DEV)); + std::unique_ptr conv_input_holder{ + conv_input_mlu_ptr, mlu_deleter}; + std::unique_ptr add_input_holder{ + add_input_mlu_ptr, mlu_deleter}; + std::unique_ptr relu_output_holder{ + relu_output_mlu_ptr, mlu_deleter}; + std::unique_ptr add_output_holder{ + add_output_mlu_ptr, mlu_deleter}; + + network.infer_model( + {conv_input_mlu_ptr, add_output_mlu_ptr}, + {relu_output_mlu_ptr, add_output_mlu_ptr}, + {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}}); + + // result memory copy cnml->cpu + // memory copy cpu->mlu + MGB_CNRT_CHECK(cnrtMemcpy( + relu_output_cpu_data.data(), relu_output_mlu_ptr, + relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST)); + MGB_CNRT_CHECK(cnrtMemcpy( + add_output_cpu_data.data(), add_output_mlu_ptr, + relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST)); + + auto cn_cpu = CompNode::load("cpu0"); + auto buf = network.get_serialized_model(false); + auto x = std::make_shared( + cn_cpu, TensorLayout{{ni, ci, hi, wi}, dtype::Float32()}); + auto add = std::make_shared( + cn_cpu, TensorLayout{{no, co, ho, wo}, dtype::Float32()}); + std::memcpy( + reinterpret_cast(x->ptr()), conv_input_cpu_data.data(), + conv_input_count * sizeof(float)); + std::memcpy( + reinterpret_cast(add->ptr()), add_input_cpu_data.data(), + relu_output_count * sizeof(float)); + auto graph = ComputingGraph::make(); + auto x_ = opr::Host2DeviceCopy::make(*graph, x, {cn_cpu}); + auto add_ = opr::Host2DeviceCopy::make(*graph, add, {cn_cpu}); + x_ = opr::Copy::make(x_, {cn}); + add_ = opr::Copy::make(add_, {cn}); + auto outs = opr::MagicMindRuntimeOpr::make( + reinterpret_cast(buf.data()), buf.size(), {x_, add_}); + auto out1 = outs[0]; + auto out2 = outs[1]; + HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32()); + HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32()); + auto func = graph->compile( + {make_callback_copy(out1, o1), make_callback_copy(out2, o2)}); + func->execute(); + HostTensorND o1_mm(cn, {no, co, ho, wo}, dtype::Float32()), + o2_mm(cn, {no, co, ho, wo}, dtype::Float32()); + std::memcpy( + o1_mm.ptr(), relu_output_cpu_data.data(), + relu_output_count * sizeof(float)); + std::memcpy( + o2_mm.ptr(), add_output_cpu_data.data(), + relu_output_count * sizeof(float)); + MGB_ASSERT_TENSOR_NEAR(o1, o1_mm, 1e-4); + MGB_ASSERT_TENSOR_NEAR(o2, o2_mm, 1e-4); +} + +#endif + +// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
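
A minimal usage sketch, condensed from TestMagicMindRuntimeOpr.Basic above, showing how a pre-serialized MagicMind model is attached to a MegEngine graph through MagicMindRuntimeOpr::make. The path "model.mlu" is a placeholder for a model produced by IModel::SerializeToFile(); the reinterpret_cast target (const void*) is an assumption, since the exact raw-buffer overload signature is not fully visible in this patch excerpt; input order and shapes must match the tensors marked as network inputs at build time ("x", then "add"), and the whole snippet is only meaningful inside an MGB_CAMBRICON build.

// Usage sketch (assumptions noted above); mirrors the flow in the test file.
#include <fstream>
#include <iterator>
#include "megbrain/cambricon/magicmind_runtime_opr.h"
#include "megbrain/opr/io.h"

using namespace mgb;

void run_serialized_magicmind_model() {
    auto cn = CompNode::load("cambricon0");

    // Load the serialized MagicMind model into a host buffer.
    // "model.mlu" is a hypothetical path, not produced by this patch.
    std::ifstream fin("model.mlu", std::ios::binary);
    std::string buf{std::istreambuf_iterator<char>(fin),
                    std::istreambuf_iterator<char>()};

    // Host inputs corresponding to the network inputs "x" and "add",
    // using the same static shapes as the test network.
    auto x = std::make_shared<HostTensorND>(
            cn, TensorLayout{{16, 64, 32, 32}, dtype::Float32()});
    auto add = std::make_shared<HostTensorND>(
            cn, TensorLayout{{16, 64, 32, 32}, dtype::Float32()});

    auto graph = ComputingGraph::make();
    auto x_ = opr::Host2DeviceCopy::make(*graph, x);
    auto add_ = opr::Host2DeviceCopy::make(*graph, add);

    // One output SymbolVar per tensor marked as a network output
    // ("out1" and "out2" in the test network).
    // The const void* cast target is assumed, as explained above.
    auto outs = opr::MagicMindRuntimeOpr::make(
            reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});

    HostTensorND o1(cn, {16, 64, 32, 32}, dtype::Float32());
    HostTensorND o2(cn, {16, 64, 32, 32}, dtype::Float32());
    auto func = graph->compile(
            {make_callback_copy(outs[0], o1), make_callback_copy(outs[1], o2)});
    func->execute();
}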