
/**
 * \file src/cambricon/test/magicmind_runtime_opr.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
 * implied.
 */
#include "megbrain/comp_node_env.h"
#include "megbrain/opr/basic_arith.h"
#include "megbrain/opr/io.h"
#include "megbrain/plugin/profiler.h"
#include "megbrain/serialization/serializer.h"
#include "megbrain/test/helper.h"

#if MGB_CAMBRICON

#include "megbrain/cambricon/magicmind_runtime_opr.h"
#include "interface_builder.h"
#include "interface_network.h"

using namespace mgb;
using namespace opr;
using namespace magicmind;

namespace {
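// Helper: fill `data` with `num_elems` pseudo-random values roughly in
// [-scale / 2, scale / 2). rand_r() is used with a local, time-based seed; the
// values only need to look random, not be reproducible across runs.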
template <typename T>
void gen_rand_data(std::vector<T>& data, size_t num_elems, size_t scale) {
    unsigned int seed = time(0);
    data.resize(num_elems);
    for (size_t i = 0; i < num_elems; ++i) {
        data[i] =
                static_cast<T>((rand_r(&seed) % (scale * 1000)) / 1000.0 - scale / 2.0);
    }
}
template <typename T>
void get_min_max(std::vector<T>& data, double& min, double& max) {
    min = *std::min_element(data.begin(), data.end());
    max = *std::max_element(data.begin(), data.end());
}
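// Helper: cast a float32 buffer into `output_type` via cnrtCastDataType. For
// quantized integer targets (INT8/INT16) a symmetric uniform quantization
// parameter is derived from the data range first; the observed range is
// returned through `min`/`max` so the caller can set the matching tensor's
// dynamic range.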
void cast_data_type(
        std::vector<float>& input, void* output, size_t size, cnrtDataType_t input_type,
        cnrtDataType_t output_type, double& min, double& max) {
    cnrtQuantizedParam_t param = NULL;
    if (output_type == CNRT_INT8 || output_type == CNRT_INT16) {
        get_min_max(input, min, max);
        int bitwidth = 8;
        if (output_type == CNRT_INT8) {
            bitwidth = 8;
        } else if (output_type == CNRT_INT16) {
            bitwidth = 16;
        }
        auto par_tmp = magicmind::RangeToUniformQuantParamWithQuantAlg(
                {min, max}, bitwidth, "symmetric");
        auto par = magicmind::UniformToNormalCast(par_tmp);
        MGB_CNRT_CHECK(cnrtCreateQuantizedParam(&param, par.pos, par.scale, 0));
    }
    MGB_CNRT_CHECK(cnrtCastDataType(
            reinterpret_cast<void*>(input.data()), input_type, output, output_type,
            size, param));
}
cnrtDataType_t convert_data_type(magicmind::DataType dtype) {
    static const std::unordered_map<magicmind::DataType, cnrtDataType_t> dtype_map = {
#define cb(dt_mm_, dt_cnrt_) {magicmind::DataType::dt_mm_, CNRT_##dt_cnrt_}
            cb(QINT8, INT8),      cb(QINT16, INT16),    cb(INT8, INT8),
            cb(INT16, INT16),     cb(INT32, INT32),     cb(UINT8, UINT8),
            cb(FLOAT16, FLOAT16), cb(FLOAT32, FLOAT32),
#undef cb
    };
    auto it = dtype_map.find(dtype);
    if (it != dtype_map.end())
        return it->second;
    else {
        mgb_assert(
                false, "unsupported magicmind dtype(%u).",
                static_cast<uint32_t>(dtype));
    }
}
///! taken from src/jit/impl/utils.cpp
void replace_all_pairs_inplace(
        std::string& text,
        const std::vector<std::pair<std::string, std::string>>& replace) {
    using str = std::string;
    auto repl_one = [&text](const str& from, const str& to) {
        mgb_assert(!from.empty());
        size_t pos = 0;
        while ((pos = text.find(from, pos)) != str::npos) {
            text.replace(pos, from.size(), to);
            pos += to.size();
        }
    };
    for (auto&& i : replace) {
        repl_one(i.first, i.second);
    }
}
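// Test fixture that drives the MagicMind builder API directly: build() creates
// a small conv + relu + elementwise-add network, get_serialized_model()
// returns the compiled model as a byte buffer, and infer_model() runs the
// model through the native MagicMind runtime so its outputs can be compared
// against MagicMindRuntimeOpr inside a MegEngine graph.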
class MMNetwork {
public:
    template <typename T>
    using MagicMindUniquePtr = magicmind_intl::MagicMindUniquePtr<T>;
    using IModelPtr = MagicMindRuntimeOpr::IModelPtr;
    using IContextPtr = MagicMindRuntimeOpr::IContextPtr;
    using IEnginePtr = MagicMindRuntimeOpr::IEnginePtr;
    const CompNode& cn_;
    magicmind::DataType op_datatype_;
    IModelPtr model_;
    bool graph_shape_mutable_;
    bool built_;
    template <typename T>
    static MagicMindUniquePtr<T> make_mm_unique_ptr(T* ptr) {
        return {ptr, magicmind_intl::MagicMindDeleter<T>()};
    }
    MMNetwork(
            const CompNode& cn, magicmind::DataType op_datatype,
            bool graph_shape_mutable = false)
            : cn_{cn},
              op_datatype_{op_datatype},
              model_{nullptr},
              graph_shape_mutable_{graph_shape_mutable},
              built_{false} {}
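    // Build the network shown in the ASCII diagram below. The builder config
    // is a JSON string whose {{GRAPH_SHAPE_MUTABLE}} placeholder is
    // substituted at runtime; qint8_mixed_float16 precision is requested so
    // that both quantized and float paths are exercised.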
    void build() {
        auto&& cnrt_env = CompNodeEnv::from_comp_node(cn_).cnrt_env();
        cnrt_env.activate();
        constexpr int ni = 16, ci = 64, hi = 32, wi = 32;
        constexpr int no = 16, co = 64, ho = 32, wo = 32;
        constexpr int kh = 3, kw = 3;
        constexpr int stride_h = 1, stride_w = 1;
        constexpr int pad_h = 1, pad_w = 1;
        magicmind::Dims input_dim{{ni, ci, hi, wi}};
        magicmind::Dims filter_dim{{co, ci, kh, kw}};
        magicmind::Dims bias_dim{{co}};
        magicmind::Dims add_dim{{no, co, ho, wo}};
        magicmind::DataType output_datatype = magicmind::DataType::FLOAT32;
        // init
        auto builder = make_mm_unique_ptr(magicmind::CreateIBuilder());
        auto config = make_mm_unique_ptr(magicmind::CreateIBuilderConfig());
        std::string user_json_config = R"(
{
    "graph_shape_mutable": {{GRAPH_SHAPE_MUTABLE}},
    "precision_config": {
        "precision_mode": "qint8_mixed_float16"
    }
}
)";
        replace_all_pairs_inplace(
                user_json_config,
                {{"{{GRAPH_SHAPE_MUTABLE}}", std::to_string(graph_shape_mutable_)}});
        config->ParseFromString(user_json_config);
        auto network = make_mm_unique_ptr(magicmind::CreateINetwork());
        magicmind::Range filter_range = {0.0f, 0.0f};
        // create input tensor
        auto init_tensor = [](magicmind::ITensor* tensor, const std::string& name,
                              const Dims& input_dim) {
            magicmind::Range input_range = {0.0f, 0.0f};
            std::vector<float> temp_buffer;
            gen_rand_data(temp_buffer, input_dim.GetElementCount(), 256);
            get_min_max(temp_buffer, input_range.min, input_range.max);
            MM_CHECK(tensor->SetDynamicRange(input_range, false));
            tensor->SetTensorName(name);
        };
        auto input_tensor = network->AddInput(op_datatype_, input_dim);
        init_tensor(input_tensor, "x", input_dim);
        auto add_tensor = network->AddInput(output_datatype, add_dim);
        init_tensor(add_tensor, "add", add_dim);
        // create filter tensor
        magicmind::ITensor* filter_tensor = nullptr;
        {
            std::vector<float> filter_buf;
            gen_rand_data(filter_buf, filter_dim.GetElementCount(), 1);
            std::vector<uint8_t> filter_buf_intx;
            filter_buf_intx.resize(
                    filter_dim.GetElementCount() *
                    magicmind::DataTypeSize(op_datatype_));
            cast_data_type(
                    filter_buf, reinterpret_cast<void*>(filter_buf_intx.data()),
                    filter_dim.GetElementCount(), CNRT_FLOAT32,
                    convert_data_type(op_datatype_), filter_range.min,
                    filter_range.max);
            auto filter = network->AddIConstNode(
                    op_datatype_, filter_dim,
                    reinterpret_cast<void*>(filter_buf_intx.data()));
            filter_tensor = filter->GetOutput(0);
            filter_tensor->SetDynamicRange(filter_range, false);
        }
        // create bias tensor
        magicmind::ITensor* bias_tensor = nullptr;
        {
            std::vector<float> bias_buf;
            gen_rand_data(bias_buf, bias_dim.GetElementCount(), 1);
            std::vector<uint8_t> bias_buf_floatx;
            if (output_datatype == magicmind::DataType::FLOAT16) {
                bias_buf_floatx.resize(
                        bias_dim.GetElementCount() *
                        magicmind::DataTypeSize(output_datatype));
                double min = 0., max = 0.;
                cast_data_type(
                        bias_buf, reinterpret_cast<void*>(bias_buf_floatx.data()),
                        bias_dim.GetElementCount(), CNRT_FLOAT32,
                        convert_data_type(output_datatype), min, max);
                auto bias = network->AddIConstNode(
                        output_datatype, bias_dim,
                        reinterpret_cast<void*>(bias_buf_floatx.data()));
                bias_tensor = bias->GetOutput(0);
            } else {
                auto bias = network->AddIConstNode(
                        output_datatype, bias_dim,
                        reinterpret_cast<void*>(bias_buf.data()));
                bias_tensor = bias->GetOutput(0);
            }
        }
        //  x   w   bias
        //   \  /    |
        //    |     /
        //    conv
        //     |
        //    relu ------ out1
        //      \     add
        //       \   /
        //        |
        //       out2
        // create conv + relu node
        auto conv = network->AddIConvNode(input_tensor, filter_tensor, bias_tensor);
        MM_CHECK(conv->SetStride(stride_h, stride_w));
        MM_CHECK(conv->SetPad(pad_h, pad_w, pad_h, pad_w));
        MM_CHECK(conv->SetDilation(1, 1));
        MM_CHECK(conv->SetPaddingMode(magicmind::IPaddingMode::EXPLICIT));
        auto conv_output = conv->GetOutput(0);
        // the conv output tensor datatype should be the same as the bias tensor
        MM_CHECK(conv->SetOutputType(0, output_datatype));
        // the relu output tensor datatype will be the same as its input tensor
        auto relu =
                network->AddIActivationNode(conv_output, magicmind::IActivation::RELU);
        MM_CHECK(relu->SetOutputType(0, output_datatype));
        relu->GetOutput(0)->SetTensorName("out1");
        // set output nodes
        MM_CHECK(network->MarkOutput(relu->GetOutput(0)));
        // create elemwise add
        auto add = network->AddIElementwiseNode(
                relu->GetOutput(0), add_tensor, magicmind::IElementwise::ADD);
        add->GetOutput(0)->SetTensorName("out2");
        MM_CHECK(network->MarkOutput(add->GetOutput(0)));
        // create model
        model_ = {
                builder->BuildModel("model", network.get(), config.get()),
                magicmind_intl::MagicMindDeleter<magicmind::IModel>()};
        mgb_assert(model_ != nullptr);
        built_ = true;
    }
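    // Accessors: get_inference_model() returns the lazily built model;
    // get_serialized_model() additionally serializes it into a byte buffer
    // (and optionally to ./output/MagicMindRuntimeOprTest.*.mlu on disk).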
    const IModelPtr& get_inference_model() {
        if (!built_)
            build();
        return model_;
    }
    std::string get_serialized_model(bool serialize_to_file) {
        if (!built_)
            build();
        size_t size = 0;
        MM_CHECK(model_->GetSerializedModelSize(&size));
        std::string buf;
        buf.resize(size);
        MM_CHECK(model_->SerializeToMemory(reinterpret_cast<void*>(buf.data()), size));
        if (serialize_to_file) {
            std::string fname = ssprintf(
                    "./output/MagicMindRuntimeOprTest.%s.mlu",
                    graph_shape_mutable_ ? "GraphShapeMutable"
                                         : "GraphShapeImmutableBatch");
            model_->SerializeToFile(fname.c_str());
        }
        return buf;
    }
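    // Run the model through the native MagicMind runtime: create an engine
    // and a context, bind the caller-provided MLU buffers to the named I/O
    // tensors ("x"/"add" in, "out1"/"out2" out), then enqueue the model 50
    // times and report the mean latency measured with cnrt notifiers.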
    void infer_model(
            const std::vector<void*>& inputs, const std::vector<void*>& outputs,
            const std::vector<Dims>& input_dims) {
        if (!built_)
            build();
        auto&& cnrt_env = CompNodeEnv::from_comp_node(cn_).cnrt_env();
        cnrt_env.activate();
        auto engine = make_mm_unique_ptr(model_->CreateIEngine());
        mgb_assert(engine != nullptr);
        auto context = make_mm_unique_ptr(engine->CreateIContext());
        mgb_assert(context != nullptr);
        // create and get irttensor from context
        std::vector<magicmind::IRTTensor*> input_tensors;
        std::vector<magicmind::IRTTensor*> output_tensors;
        MM_CHECK(CreateInputTensors(context.get(), &input_tensors));
        MM_CHECK(CreateOutputTensors(context.get(), &output_tensors));
        MM_CHECK(FindIRTTensorByName(input_tensors, "x")->SetDimensions(input_dims[0]));
        MM_CHECK(FindIRTTensorByName(input_tensors, "add")
                         ->SetDimensions(input_dims[1]));
        MM_CHECK(context->InferOutputShape(input_tensors, output_tensors));
        MM_CHECK(FindIRTTensorByName(input_tensors, "x")->SetData(inputs[0]));
        MM_CHECK(FindIRTTensorByName(input_tensors, "add")->SetData(inputs[1]));
        MM_CHECK(FindIRTTensorByName(output_tensors, "out1")->SetData(outputs[0]));
        MM_CHECK(FindIRTTensorByName(output_tensors, "out2")->SetData(outputs[1]));
        auto&& queue = cnrt_env.queue;
        cnrtNotifier_t start, end;
        MGB_CNRT_CHECK(cnrtCreateNotifier(&start));
        MGB_CNRT_CHECK(cnrtCreateNotifier(&end));
        MGB_CNRT_CHECK(cnrtPlaceNotifier(start, queue));
        constexpr size_t runs = 50;
        for (size_t i = 0; i < runs; ++i) {
            MM_CHECK(context->Enqueue(input_tensors, output_tensors, queue));
        }
        MGB_CNRT_CHECK(cnrtPlaceNotifier(end, queue));
        MGB_CNRT_CHECK(cnrtSyncQueue(queue));
        float time = 0.f;
        MGB_CNRT_CHECK(cnrtNotifierDuration(start, end, &time));
        // cnrtNotifierDuration reports microseconds, so this prints milliseconds
        printf("inference time = %.2fms\n", time / static_cast<float>(runs) * 1e-3);
        MGB_CNRT_CHECK(cnrtDestroyNotifier(&start));
        MGB_CNRT_CHECK(cnrtDestroyNotifier(&end));
    }
};
}  // namespace
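// Float32 end-to-end check: run the network once through the native MagicMind
// runtime, then through MagicMindRuntimeOpr inside a MegEngine graph, and
// require both pairs of outputs to match to 1e-4.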
TEST(TestMagicMindRuntimeOpr, Basic) {
    REQUIRE_CAMBRICON_DEVICE(1);
    auto cn = CompNode::load("cambricon0");
    MMNetwork network(cn, magicmind::DataType::FLOAT32, false);
    size_t dtype_size = magicmind::DataTypeSize(magicmind::DataType::FLOAT32);
    // prepare parameters for addpad and conv
    const int ni = 16, ci = 64, hi = 32, wi = 32;
    const int no = 16, co = 64, ho = 32, wo = 32;
    // count tensor elements
    int conv_input_count = ni * hi * wi * ci;
    int relu_output_count = no * ho * wo * co;
    // prepare cpu origin data
    std::vector<float> conv_input_cpu_data;
    gen_rand_data(conv_input_cpu_data, conv_input_count, 256);
    std::vector<float> add_input_cpu_data;
    gen_rand_data(add_input_cpu_data, relu_output_count, 256);
    std::vector<float> relu_output_cpu_data(relu_output_count);
    std::vector<float> add_output_cpu_data(relu_output_count);
    auto mlu_deleter = [](void* p) { MGB_CNRT_CHECK(cnrtFree(p)); };
    void* conv_input_mlu_ptr;
    void* add_input_mlu_ptr;
    void* relu_output_mlu_ptr;
    void* add_output_mlu_ptr;
    // malloc mlu mem for fusion input and output
    MGB_CNRT_CHECK(cnrtMalloc(&conv_input_mlu_ptr, conv_input_count * dtype_size));
    MGB_CNRT_CHECK(cnrtMalloc(&add_input_mlu_ptr, relu_output_count * sizeof(float)));
    MGB_CNRT_CHECK(cnrtMalloc(&relu_output_mlu_ptr, relu_output_count * sizeof(float)));
    MGB_CNRT_CHECK(cnrtMalloc(&add_output_mlu_ptr, relu_output_count * sizeof(float)));
    // memory copy cpu->mlu
    MGB_CNRT_CHECK(cnrtMemcpy(
            conv_input_mlu_ptr, conv_input_cpu_data.data(),
            conv_input_count * dtype_size, CNRT_MEM_TRANS_DIR_HOST2DEV));
    MGB_CNRT_CHECK(cnrtMemcpy(
            add_input_mlu_ptr, add_input_cpu_data.data(),
            relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_HOST2DEV));
    std::unique_ptr<void, decltype(mlu_deleter)> conv_input_holder{
            conv_input_mlu_ptr, mlu_deleter};
    std::unique_ptr<void, decltype(mlu_deleter)> add_input_holder{
            add_input_mlu_ptr, mlu_deleter};
    std::unique_ptr<void, decltype(mlu_deleter)> relu_output_holder{
            relu_output_mlu_ptr, mlu_deleter};
    std::unique_ptr<void, decltype(mlu_deleter)> add_output_holder{
            add_output_mlu_ptr, mlu_deleter};
    network.infer_model(
            {conv_input_mlu_ptr, add_input_mlu_ptr},
            {relu_output_mlu_ptr, add_output_mlu_ptr},
            {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}});
    // copy results mlu->cpu
    MGB_CNRT_CHECK(cnrtMemcpy(
            relu_output_cpu_data.data(), relu_output_mlu_ptr,
            relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
    MGB_CNRT_CHECK(cnrtMemcpy(
            add_output_cpu_data.data(), add_output_mlu_ptr,
            relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
    auto buf = network.get_serialized_model(false);
    auto x = std::make_shared<HostTensorND>(
            cn, TensorLayout{{ni, ci, hi, wi}, dtype::Float32()});
    auto add = std::make_shared<HostTensorND>(
            cn, TensorLayout{{no, co, ho, wo}, dtype::Float32()});
    std::memcpy(
            reinterpret_cast<void*>(x->ptr<dt_float32>()), conv_input_cpu_data.data(),
            conv_input_count * sizeof(float));
    std::memcpy(
            reinterpret_cast<void*>(add->ptr<dt_float32>()), add_input_cpu_data.data(),
            relu_output_count * sizeof(float));
    auto graph = ComputingGraph::make();
    auto x_ = opr::Host2DeviceCopy::make(*graph, x);
    auto add_ = opr::Host2DeviceCopy::make(*graph, add);
    auto outs = opr::MagicMindRuntimeOpr::make(
            reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
    auto out1 = outs[0];
    auto out2 = outs[1];
    HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32());
    HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32());
    auto func = graph->compile(
            {make_callback_copy(out1, o1), make_callback_copy(out2, o2)});
    func->execute();
    HostTensorND o1_mm(cn, {no, co, ho, wo}, dtype::Float32()),
            o2_mm(cn, {no, co, ho, wo}, dtype::Float32());
    std::memcpy(
            o1_mm.ptr<float>(), relu_output_cpu_data.data(),
            relu_output_count * sizeof(float));
    std::memcpy(
            o2_mm.ptr<float>(), add_output_cpu_data.data(),
            relu_output_count * sizeof(float));
    MGB_ASSERT_TENSOR_NEAR(o1, o1_mm, 1e-4);
    MGB_ASSERT_TENSOR_NEAR(o2, o2_mm, 1e-4);
}
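// Same check as Basic, but the convolution input is fed as QuantizedS8 data,
// exercising the quantized input path of MagicMindRuntimeOpr.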
TEST(TestMagicMindRuntimeOpr, InputQInt8) {
    REQUIRE_CAMBRICON_DEVICE(1);
    auto cn = CompNode::load("cambricon0");
    MMNetwork network(cn, magicmind::DataType::QINT8, false);
    size_t dtype_size = magicmind::DataTypeSize(magicmind::DataType::QINT8);
    // prepare parameters for addpad and conv
    const int ni = 16, ci = 64, hi = 32, wi = 32;
    const int no = 16, co = 64, ho = 32, wo = 32;
    // count tensor elements
    int conv_input_count = ni * hi * wi * ci;
    int relu_output_count = no * ho * wo * co;
    // prepare cpu origin data
    std::vector<int8_t> conv_input_cpu_data;
    gen_rand_data(conv_input_cpu_data, conv_input_count, 256);
    std::vector<float> add_input_cpu_data;
    gen_rand_data(add_input_cpu_data, relu_output_count, 256);
    std::vector<float> relu_output_cpu_data(relu_output_count);
    std::vector<float> add_output_cpu_data(relu_output_count);
    auto mlu_deleter = [](void* p) { MGB_CNRT_CHECK(cnrtFree(p)); };
    void* conv_input_mlu_ptr;
    void* add_input_mlu_ptr;
    void* relu_output_mlu_ptr;
    void* add_output_mlu_ptr;
    // malloc mlu mem for fusion input and output
    MGB_CNRT_CHECK(cnrtMalloc(&conv_input_mlu_ptr, conv_input_count * dtype_size));
    MGB_CNRT_CHECK(cnrtMalloc(&add_input_mlu_ptr, relu_output_count * sizeof(float)));
    MGB_CNRT_CHECK(cnrtMalloc(&relu_output_mlu_ptr, relu_output_count * sizeof(float)));
    MGB_CNRT_CHECK(cnrtMalloc(&add_output_mlu_ptr, relu_output_count * sizeof(float)));
    // memory copy cpu->mlu
    MGB_CNRT_CHECK(cnrtMemcpy(
            conv_input_mlu_ptr, conv_input_cpu_data.data(),
            conv_input_count * dtype_size, CNRT_MEM_TRANS_DIR_HOST2DEV));
    MGB_CNRT_CHECK(cnrtMemcpy(
            add_input_mlu_ptr, add_input_cpu_data.data(),
            relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_HOST2DEV));
    std::unique_ptr<void, decltype(mlu_deleter)> conv_input_holder{
            conv_input_mlu_ptr, mlu_deleter};
    std::unique_ptr<void, decltype(mlu_deleter)> add_input_holder{
            add_input_mlu_ptr, mlu_deleter};
    std::unique_ptr<void, decltype(mlu_deleter)> relu_output_holder{
            relu_output_mlu_ptr, mlu_deleter};
    std::unique_ptr<void, decltype(mlu_deleter)> add_output_holder{
            add_output_mlu_ptr, mlu_deleter};
    network.infer_model(
            {conv_input_mlu_ptr, add_input_mlu_ptr},
            {relu_output_mlu_ptr, add_output_mlu_ptr},
            {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}});
    // copy results mlu->cpu
    MGB_CNRT_CHECK(cnrtMemcpy(
            relu_output_cpu_data.data(), relu_output_mlu_ptr,
            relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
    MGB_CNRT_CHECK(cnrtMemcpy(
            add_output_cpu_data.data(), add_output_mlu_ptr,
            relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
    auto buf = network.get_serialized_model(false);
    auto x = std::make_shared<HostTensorND>(
            cn, TensorLayout{{ni, ci, hi, wi}, dtype::QuantizedS8{1.f}});
    auto add = std::make_shared<HostTensorND>(
            cn, TensorLayout{{no, co, ho, wo}, dtype::Float32()});
    std::memcpy(
            reinterpret_cast<void*>(x->raw_ptr()), conv_input_cpu_data.data(),
            conv_input_count * sizeof(int8_t));
    std::memcpy(
            reinterpret_cast<void*>(add->ptr<dt_float32>()), add_input_cpu_data.data(),
            relu_output_count * sizeof(float));
    auto graph = ComputingGraph::make();
    auto x_ = opr::Host2DeviceCopy::make(*graph, x);
    auto add_ = opr::Host2DeviceCopy::make(*graph, add);
    auto outs = opr::MagicMindRuntimeOpr::make(
            reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
    auto out1 = outs[0];
    auto out2 = outs[1];
    HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32());
    HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32());
    auto func = graph->compile(
            {make_callback_copy(out1, o1), make_callback_copy(out2, o2)});
    func->execute();
    HostTensorND o1_mm(cn, {no, co, ho, wo}, dtype::Float32()),
            o2_mm(cn, {no, co, ho, wo}, dtype::Float32());
    std::memcpy(
            o1_mm.ptr<float>(), relu_output_cpu_data.data(),
            relu_output_count * sizeof(float));
    std::memcpy(
            o2_mm.ptr<float>(), add_output_cpu_data.data(),
            relu_output_count * sizeof(float));
    MGB_ASSERT_TENSOR_NEAR(o1, o1_mm, 1e-4);
    MGB_ASSERT_TENSOR_NEAR(o2, o2_mm, 1e-4);
}
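// Build the model with graph_shape_mutable=true and repeat the comparison for
// several batch sizes and spatial shapes, verifying that one serialized model
// can serve multiple input shapes.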
TEST(TestMagicMindRuntimeOpr, GraphShapeMutable) {
    REQUIRE_CAMBRICON_DEVICE(1);
    auto cn = CompNode::load("cambricon0");
    MMNetwork network(cn, magicmind::DataType::FLOAT32, true);
    size_t dtype_size = magicmind::DataTypeSize(magicmind::DataType::FLOAT32);
    auto check = [&](magicmind::Dims input_dim, magicmind::Dims output_dim) {
        // prepare parameters for addpad and conv
        const int ni = input_dim[0], ci = input_dim[1], hi = input_dim[2],
                  wi = input_dim[3];
        const int no = output_dim[0], co = output_dim[1], ho = output_dim[2],
                  wo = output_dim[3];
        // count tensor elements
        int conv_input_count = ni * hi * wi * ci;
        int relu_output_count = no * ho * wo * co;
        // prepare cpu origin data
        std::vector<float> conv_input_cpu_data;
        gen_rand_data(conv_input_cpu_data, conv_input_count, 256);
        std::vector<float> add_input_cpu_data;
        gen_rand_data(add_input_cpu_data, relu_output_count, 256);
        std::vector<float> relu_output_cpu_data(relu_output_count);
        std::vector<float> add_output_cpu_data(relu_output_count);
        auto mlu_deleter = [](void* p) { MGB_CNRT_CHECK(cnrtFree(p)); };
        void* conv_input_mlu_ptr;
        void* add_input_mlu_ptr;
        void* relu_output_mlu_ptr;
        void* add_output_mlu_ptr;
        // malloc mlu mem for fusion input and output
        MGB_CNRT_CHECK(cnrtMalloc(&conv_input_mlu_ptr, conv_input_count * dtype_size));
        MGB_CNRT_CHECK(
                cnrtMalloc(&add_input_mlu_ptr, relu_output_count * sizeof(float)));
        MGB_CNRT_CHECK(
                cnrtMalloc(&relu_output_mlu_ptr, relu_output_count * sizeof(float)));
        MGB_CNRT_CHECK(
                cnrtMalloc(&add_output_mlu_ptr, relu_output_count * sizeof(float)));
        // memory copy cpu->mlu
        MGB_CNRT_CHECK(cnrtMemcpy(
                conv_input_mlu_ptr, conv_input_cpu_data.data(),
                conv_input_count * dtype_size, CNRT_MEM_TRANS_DIR_HOST2DEV));
        MGB_CNRT_CHECK(cnrtMemcpy(
                add_input_mlu_ptr, add_input_cpu_data.data(),
                relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_HOST2DEV));
        std::unique_ptr<void, decltype(mlu_deleter)> conv_input_holder{
                conv_input_mlu_ptr, mlu_deleter};
        std::unique_ptr<void, decltype(mlu_deleter)> add_input_holder{
                add_input_mlu_ptr, mlu_deleter};
        std::unique_ptr<void, decltype(mlu_deleter)> relu_output_holder{
                relu_output_mlu_ptr, mlu_deleter};
        std::unique_ptr<void, decltype(mlu_deleter)> add_output_holder{
                add_output_mlu_ptr, mlu_deleter};
        network.infer_model(
                {conv_input_mlu_ptr, add_input_mlu_ptr},
                {relu_output_mlu_ptr, add_output_mlu_ptr},
                {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}});
        // copy results mlu->cpu
        MGB_CNRT_CHECK(cnrtMemcpy(
                relu_output_cpu_data.data(), relu_output_mlu_ptr,
                relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
        MGB_CNRT_CHECK(cnrtMemcpy(
                add_output_cpu_data.data(), add_output_mlu_ptr,
                relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
        auto buf = network.get_serialized_model(true);
        auto mkshp = [](int n, int c, int h, int w) {
            size_t nz = n, cz = c, hz = h, wz = w;
            return TensorShape{nz, cz, hz, wz};
        };
        auto mkly = [](int n, int c, int h, int w, DType dtype) {
            size_t nz = n, cz = c, hz = h, wz = w;
            return TensorLayout{{nz, cz, hz, wz}, dtype};
        };
        auto x = std::make_shared<HostTensorND>(
                cn, mkly(ni, ci, hi, wi, dtype::Float32()));
        auto add = std::make_shared<HostTensorND>(
                cn, mkly(no, co, ho, wo, dtype::Float32()));
        std::memcpy(
                reinterpret_cast<void*>(x->ptr<dt_float32>()),
                conv_input_cpu_data.data(), conv_input_count * sizeof(float));
        std::memcpy(
                reinterpret_cast<void*>(add->ptr<dt_float32>()),
                add_input_cpu_data.data(), relu_output_count * sizeof(float));
        auto graph = ComputingGraph::make();
        auto x_ = opr::Host2DeviceCopy::make(*graph, x);
        auto add_ = opr::Host2DeviceCopy::make(*graph, add);
        auto outs = opr::MagicMindRuntimeOpr::make(
                reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
        auto out1 = outs[0];
        auto out2 = outs[1];
        HostTensorND o1(cn, mkshp(no, co, ho, wo), dtype::Float32());
        HostTensorND o2(cn, mkshp(no, co, ho, wo), dtype::Float32());
        auto func = graph->compile(
                {make_callback_copy(out1, o1), make_callback_copy(out2, o2)});
        func->execute();
        HostTensorND o1_mm(cn, mkshp(no, co, ho, wo), dtype::Float32()),
                o2_mm(cn, mkshp(no, co, ho, wo), dtype::Float32());
        std::memcpy(
                o1_mm.ptr<float>(), relu_output_cpu_data.data(),
                relu_output_count * sizeof(float));
        std::memcpy(
                o2_mm.ptr<float>(), add_output_cpu_data.data(),
                relu_output_count * sizeof(float));
        MGB_ASSERT_TENSOR_NEAR(o1, o1_mm, 1e-4);
        MGB_ASSERT_TENSOR_NEAR(o2, o2_mm, 1e-4);
    };
    check(Dims{{1, 64, 32, 32}}, Dims{{1, 64, 32, 32}});
    check(Dims{{32, 64, 32, 32}}, Dims{{32, 64, 32, 32}});
    check(Dims{{7, 64, 16, 16}}, Dims{{7, 64, 16, 16}});
}
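// Verify that a graph containing MagicMindRuntimeOpr can be dumped with
// GraphDumper and loaded back with GraphLoader.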
TEST(TestMagicMindRuntimeOpr, Serialization) {
    using namespace serialization;
    REQUIRE_CAMBRICON_DEVICE(1);
    auto cn = CompNode::load("cambricon0");
    MMNetwork network(cn, magicmind::DataType::FLOAT32, true);
    auto buf = network.get_serialized_model(false);
    // prepare parameters for addpad and conv
    const int ni = 1, ci = 64, hi = 32, wi = 32;
    const int no = 1, co = 64, ho = 32, wo = 32;
    auto x = std::make_shared<HostTensorND>(
            cn, TensorLayout{{ni, ci, hi, wi}, dtype::Float32()});
    auto add = std::make_shared<HostTensorND>(
            cn, TensorLayout{{no, co, ho, wo}, dtype::Float32()});
    auto graph = ComputingGraph::make();
    auto x_ = opr::Host2DeviceCopy::make(*graph, x);
    auto add_ = opr::Host2DeviceCopy::make(*graph, add);
    auto outs = opr::MagicMindRuntimeOpr::make(
            reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
    auto out1 = outs[0];
    auto out2 = outs[1];
    auto fname = output_file("MagicMindRuntimeOprTest");
    auto dump = [&]() {
        auto dumper = GraphDumper::make(OutputFile::make_fs(fname.c_str()));
        auto rst = dumper->dump({out1, out2});
        ASSERT_EQ(rst.outputs.size(), 2u);
    };
    auto load = [&]() {
        auto loader = GraphLoader::make(InputFile::make_fs(fname.c_str()));
        auto rst = loader->load();
        ASSERT_EQ(rst.output_var_list.size(), 2u);
    };
    dump();
    load();
}
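// Run the runtime opr under GraphProfiler and write the profiling result to
// magicmind_runtime_opr_profile.json.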
TEST(TestMagicMindRuntimeOpr, Profiling) {
    REQUIRE_CAMBRICON_DEVICE(1);
    auto cn = CompNode::load("cambricon0");
    MMNetwork network(cn, magicmind::DataType::FLOAT32, true);
    auto buf = network.get_serialized_model(false);
    const int ni = 8, ci = 64, hi = 32, wi = 32;
    const int no = 1, co = 64, ho = 32, wo = 32;
    HostTensorGenerator<dtype::Float32, RandomDistribution::GAUSSIAN> gen(0, 1);
    auto x = gen({ni, ci, hi, wi}, cn);
    auto add = gen({no, co, ho, wo}, cn);
    auto graph = ComputingGraph::make();
    GraphProfiler profiler{graph.get()};
    auto x_ = opr::Host2DeviceCopy::make(*graph, x);
    auto add_ = opr::Host2DeviceCopy::make(*graph, add);
    auto outs = opr::MagicMindRuntimeOpr::make(
            reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
    auto out1 = outs[0];
    auto out2 = outs[1];
    graph->options().var_sanity_check_first_run = false;
    HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32());
    HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32());
    auto func = graph->compile(
            {make_callback_copy(out1, o1), make_callback_copy(out2, o2)});
    func->execute();
    profiler.to_json_full(func.get())
            ->writeto_fpath(output_file("magicmind_runtime_opr_profile.json"));
}
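// Inputs are produced on cpu0 and moved to the Cambricon comp node with
// opr::Copy before feeding MagicMindRuntimeOpr, checking cross-comp-node data
// flow; outputs are compared against the native runtime as in Basic.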
TEST(TestMagicMindRuntimeOpr, CrossCNCopy) {
    REQUIRE_CAMBRICON_DEVICE(1);
    auto cn = CompNode::load("cambricon0");
    MMNetwork network(cn, magicmind::DataType::FLOAT32, false);
    size_t dtype_size = magicmind::DataTypeSize(magicmind::DataType::FLOAT32);
    // prepare parameters for addpad and conv
    const int ni = 16, ci = 64, hi = 32, wi = 32;
    const int no = 16, co = 64, ho = 32, wo = 32;
    // count tensor elements
    int conv_input_count = ni * hi * wi * ci;
    int relu_output_count = no * ho * wo * co;
    // prepare cpu origin data
    std::vector<float> conv_input_cpu_data;
    gen_rand_data(conv_input_cpu_data, conv_input_count, 256);
    std::vector<float> add_input_cpu_data;
    gen_rand_data(add_input_cpu_data, relu_output_count, 256);
    std::vector<float> relu_output_cpu_data(relu_output_count);
    std::vector<float> add_output_cpu_data(relu_output_count);
    auto mlu_deleter = [](void* p) { MGB_CNRT_CHECK(cnrtFree(p)); };
    void* conv_input_mlu_ptr;
    void* add_input_mlu_ptr;
    void* relu_output_mlu_ptr;
    void* add_output_mlu_ptr;
    // malloc mlu mem for fusion input and output
    MGB_CNRT_CHECK(cnrtMalloc(&conv_input_mlu_ptr, conv_input_count * dtype_size));
    MGB_CNRT_CHECK(cnrtMalloc(&add_input_mlu_ptr, relu_output_count * sizeof(float)));
    MGB_CNRT_CHECK(cnrtMalloc(&relu_output_mlu_ptr, relu_output_count * sizeof(float)));
    MGB_CNRT_CHECK(cnrtMalloc(&add_output_mlu_ptr, relu_output_count * sizeof(float)));
    // memory copy cpu->mlu
    MGB_CNRT_CHECK(cnrtMemcpy(
            conv_input_mlu_ptr, conv_input_cpu_data.data(),
            conv_input_count * dtype_size, CNRT_MEM_TRANS_DIR_HOST2DEV));
    MGB_CNRT_CHECK(cnrtMemcpy(
            add_input_mlu_ptr, add_input_cpu_data.data(),
            relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_HOST2DEV));
    std::unique_ptr<void, decltype(mlu_deleter)> conv_input_holder{
            conv_input_mlu_ptr, mlu_deleter};
    std::unique_ptr<void, decltype(mlu_deleter)> add_input_holder{
            add_input_mlu_ptr, mlu_deleter};
    std::unique_ptr<void, decltype(mlu_deleter)> relu_output_holder{
            relu_output_mlu_ptr, mlu_deleter};
    std::unique_ptr<void, decltype(mlu_deleter)> add_output_holder{
            add_output_mlu_ptr, mlu_deleter};
    network.infer_model(
            {conv_input_mlu_ptr, add_input_mlu_ptr},
            {relu_output_mlu_ptr, add_output_mlu_ptr},
            {Dims{{ni, ci, hi, wi}}, Dims{{no, co, ho, wo}}});
    // copy results mlu->cpu
    MGB_CNRT_CHECK(cnrtMemcpy(
            relu_output_cpu_data.data(), relu_output_mlu_ptr,
            relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
    MGB_CNRT_CHECK(cnrtMemcpy(
            add_output_cpu_data.data(), add_output_mlu_ptr,
            relu_output_count * sizeof(float), CNRT_MEM_TRANS_DIR_DEV2HOST));
    auto cn_cpu = CompNode::load("cpu0");
    auto buf = network.get_serialized_model(false);
    auto x = std::make_shared<HostTensorND>(
            cn_cpu, TensorLayout{{ni, ci, hi, wi}, dtype::Float32()});
    auto add = std::make_shared<HostTensorND>(
            cn_cpu, TensorLayout{{no, co, ho, wo}, dtype::Float32()});
    std::memcpy(
            reinterpret_cast<void*>(x->ptr<dt_float32>()), conv_input_cpu_data.data(),
            conv_input_count * sizeof(float));
    std::memcpy(
            reinterpret_cast<void*>(add->ptr<dt_float32>()), add_input_cpu_data.data(),
            relu_output_count * sizeof(float));
    auto graph = ComputingGraph::make();
    auto x_ = opr::Host2DeviceCopy::make(*graph, x, {cn_cpu});
    auto add_ = opr::Host2DeviceCopy::make(*graph, add, {cn_cpu});
    x_ = opr::Copy::make(x_, {cn});
    add_ = opr::Copy::make(add_, {cn});
    auto outs = opr::MagicMindRuntimeOpr::make(
            reinterpret_cast<const void*>(buf.data()), buf.size(), {x_, add_});
    auto out1 = outs[0];
    auto out2 = outs[1];
    HostTensorND o1(cn, {no, co, ho, wo}, dtype::Float32());
    HostTensorND o2(cn, {no, co, ho, wo}, dtype::Float32());
    auto func = graph->compile(
            {make_callback_copy(out1, o1), make_callback_copy(out2, o2)});
    func->execute();
    HostTensorND o1_mm(cn, {no, co, ho, wo}, dtype::Float32()),
            o2_mm(cn, {no, co, ho, wo}, dtype::Float32());
    std::memcpy(
            o1_mm.ptr<float>(), relu_output_cpu_data.data(),
            relu_output_count * sizeof(float));
    std::memcpy(
            o2_mm.ptr<float>(), add_output_cpu_data.data(),
            relu_output_count * sizeof(float));
    MGB_ASSERT_TENSOR_NEAR(o1, o1_mm, 1e-4);
    MGB_ASSERT_TENSOR_NEAR(o2, o2_mm, 1e-4);
}
#endif  // MGB_CAMBRICON

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}