
basic.cpp 18 kB

/**
 * \file example/cpp_example/basic.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include <thread>
#include "../example.h"

#if LITE_BUILD_WITH_MGE
#include <cstdio>
#include "misc.h"

using namespace lite;
using namespace example;

namespace {
void output_info(std::shared_ptr<Network> network, size_t output_size) {
    for (size_t index = 0; index < output_size; index++) {
        printf("output[%zu] names %s \n", index,
               network->get_all_output_name()[index].c_str());
        std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(index);
        size_t ndim = output_tensor->get_layout().ndim;
        for (size_t i = 0; i < ndim; i++) {
            printf("output[%zu] tensor.shape[%zu] %zu \n", index, i,
                   output_tensor->get_layout().shapes[i]);
        }
    }
}

void output_data_info(std::shared_ptr<Network> network, size_t output_size) {
    for (size_t index = 0; index < output_size; index++) {
        auto output_tensor = network->get_output_tensor(index);
        void* out_data = output_tensor->get_memory_ptr();
        size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                            output_tensor->get_layout().get_elem_size();
        LiteDataType dtype = output_tensor->get_layout().data_type;
        float max = -1000.0f;
        float min = 1000.0f;
        int max_idx = 0;
        int min_idx = 0;
        float sum = 0.0f;
#define cb(_dtype, _real_dtype)                                         \
    case LiteDataType::_dtype: {                                        \
        for (size_t i = 0; i < out_length; i++) {                       \
            _real_dtype data = static_cast<_real_dtype*>(out_data)[i];  \
            sum += data;                                                \
            if (max < data) {                                           \
                max = data;                                             \
                max_idx = i;                                            \
            }                                                           \
            if (min > data) {                                           \
                min = data;                                             \
                min_idx = i;                                            \
            }                                                           \
        }                                                               \
    } break;
        switch (dtype) {
            cb(LITE_FLOAT, float);
            cb(LITE_INT, int);
            cb(LITE_INT8, int8_t);
            cb(LITE_UINT8, uint8_t);
            default:
                printf("unknown datatype");
        }
        printf("output_length %zu index %zu max=%e , max idx=%d, min=%e , min_idx=%d, "
               "sum=%e\n",
               out_length, index, max, max_idx, min, min_idx, sum);
    }
#undef cb
}
}  // namespace
#if LITE_WITH_CUDA
bool lite::example::load_from_path_run_cuda(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;
    set_log_level(LiteLogLevel::DEBUG);
    //! config the network running in CUDA device
    lite::Config config{false, -1, LiteDeviceType::LITE_CUDA};
    //! set NetworkIO
    NetworkIO network_io;
    std::string input_name = "img0_comp_fullface";
    bool is_host = false;
    IO device_input{input_name, is_host};
    network_io.inputs.push_back(device_input);
    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config, network_io);
    network->load_model(network_path);
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    Layout input_layout = input_tensor->get_layout();
    //! read data from numpy data file
    auto src_tensor = parse_npy(input_path);
    //! malloc the device memory
    auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);
    //! copy to the device memory
    tensor_device.copy_from(*src_tensor);
    //! Now the device memory is filled with user input data, set it to the
    //! input tensor
    input_tensor->reset(tensor_device.get_memory_ptr(), input_layout);
    //! forward
    {
        lite::Timer ltimer("warmup");
        network->forward();
        network->wait();
        ltimer.print_used_time(0);
    }
    lite::Timer ltimer("forward_iter");
    for (int i = 0; i < 10; i++) {
        ltimer.reset_start();
        network->forward();
        network->wait();
        ltimer.print_used_time(i);
    }
    //! get the output data or read tensor set in network_in
    size_t output_size = network->get_all_output_name().size();
    output_info(network, output_size);
    output_data_info(network, output_size);
    return true;
}
#endif
bool lite::example::basic_load_from_path(const Args& args) {
    set_log_level(LiteLogLevel::DEBUG);
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;
    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>();
    network->load_model(network_path);
    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    auto layout = input_tensor->get_layout();
    for (size_t i = 0; i < layout.ndim; i++) {
        printf("model input shape[%zu]=%zu \n", i, layout.shapes[i]);
    }
    //! copy or forward data to network
    size_t length = input_tensor->get_tensor_total_size_in_byte();
    void* dst_ptr = input_tensor->get_memory_ptr();
    auto src_tensor = parse_npy(input_path);
    auto layout0 = src_tensor->get_layout();
    for (size_t i = 0; i < layout0.ndim; i++) {
        printf("src shape[%zu]=%zu \n", i, layout0.shapes[i]);
    }
    void* src = src_tensor->get_memory_ptr();
    memcpy(dst_ptr, src, length);
    //! forward
    {
        lite::Timer ltimer("warmup");
        network->forward();
        network->wait();
        ltimer.print_used_time(0);
    }
    lite::Timer ltimer("forward_iter");
    for (int i = 0; i < 10; i++) {
        network->forward();
        network->wait();
        ltimer.print_used_time(i);
    }
    //! forward
    {
        lite::Timer ltimer("warmup");
        network->forward();
        network->wait();
        ltimer.print_used_time(0);
    }
    for (int i = 0; i < 10; i++) {
        ltimer.reset_start();
        network->forward();
        network->wait();
        ltimer.print_used_time(i);
    }
    //! get the output data or read tensor set in network_in
    size_t output_size = network->get_all_output_name().size();
    output_info(network, output_size);
    output_data_info(network, output_size);
    return true;
}
bool lite::example::basic_load_from_path_with_loader(const Args& args) {
    set_log_level(LiteLogLevel::DEBUG);
    lite::set_loader_lib_path(args.loader_path);
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;
    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>();
    network->load_model(network_path);
    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    auto input_layout = input_tensor->get_layout();
    //! copy or forward data to network
    auto src_tensor = parse_npy(input_path);
    auto src_layout = src_tensor->get_layout();
    if (src_layout.ndim != input_layout.ndim) {
        printf("src ndim is not equal to the model input ndim\n");
    }
    //! pay attention: the input shape may change
    for (size_t i = 0; i < input_layout.ndim; i++) {
        if (input_layout.shapes[i] != src_layout.shapes[i]) {
            printf("src shape is not equal to the model input shape\n");
        }
    }
    input_tensor->set_layout(src_tensor->get_layout());
    //! reset or forward data to network
    input_tensor->reset(src_tensor->get_memory_ptr(), src_tensor->get_layout());
    //! forward
    network->forward();
    network->wait();
    //! forward
    {
        lite::Timer ltimer("warmup");
        network->forward();
        network->wait();
        ltimer.print_used_time(0);
    }
    lite::Timer ltimer("forward_iter");
    for (int i = 0; i < 10; i++) {
        ltimer.reset_start();
        network->forward();
        network->wait();
        ltimer.print_used_time(i);
    }
    //! get the output data or read tensor set in network_in
    size_t output_size = network->get_all_output_name().size();
    output_info(network, output_size);
    output_data_info(network, output_size);
    return true;
}
bool lite::example::basic_load_from_memory(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;
    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>();
    FILE* fin = fopen(network_path.c_str(), "rb");
    if (!fin) {
        printf("failed to open %s.", network_path.c_str());
    }
    fseek(fin, 0, SEEK_END);
    size_t size = ftell(fin);
    fseek(fin, 0, SEEK_SET);
    void* ptr = malloc(size);
    std::shared_ptr<void> buf{ptr, ::free};
    auto len = fread(buf.get(), 1, size, fin);
    if (len < 1) {
        printf("read file failed.\n");
    }
    fclose(fin);
    network->load_model(buf.get(), size);
    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    //! copy or forward data to network
    size_t length = input_tensor->get_tensor_total_size_in_byte();
    void* dst_ptr = input_tensor->get_memory_ptr();
    auto src_tensor = parse_npy(input_path);
    void* src = src_tensor->get_memory_ptr();
    memcpy(dst_ptr, src, length);
    //! forward
    network->forward();
    network->wait();
    //! get the output data or read tensor set in network_in
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    printf("length=%zu\n", length);
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}
bool lite::example::async_forward(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;
    Config config;
    config.options.var_sanity_check_first_run = false;
    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config);
    network->load_model(network_path);
    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    //! copy or forward data to network
    size_t length = input_tensor->get_tensor_total_size_in_byte();
    void* dst_ptr = input_tensor->get_memory_ptr();
    auto src_tensor = parse_npy(input_path);
    void* src = src_tensor->get_memory_ptr();
    memcpy(dst_ptr, src, length);
    //! set async mode and callback
    volatile bool finished = false;
    network->set_async_callback([&finished]() {
#if !__DEPLOY_ON_XP_SP2__
        std::cout << "worker thread_id:" << std::this_thread::get_id() << std::endl;
#endif
        finished = true;
    });
#if !__DEPLOY_ON_XP_SP2__
    std::cout << "out thread_id:" << std::this_thread::get_id() << std::endl;
#endif
    //! forward
    network->forward();
    size_t count = 0;
    while (finished == false) {
        count++;
    }
    printf("Forward finish, count is %zu\n", count);
    //! get the output data or read tensor set in network_in
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    printf("length=%zu\n", length);
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}
bool lite::example::set_input_callback(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;
    Config config;
    config.options.var_sanity_check_first_run = false;
    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config);
    network->load_model(network_path);
    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    //! copy or forward data to network
    size_t length = input_tensor->get_tensor_total_size_in_byte();
    void* dst_ptr = input_tensor->get_memory_ptr();
    auto src_tensor = parse_npy(input_path);
    void* src = src_tensor->get_memory_ptr();
    memcpy(dst_ptr, src, length);
    //! set input callback
    volatile bool finished = false;
    network->set_start_callback(
            [&finished](const std::unordered_map<
                        std::string, std::pair<IO, std::shared_ptr<Tensor>>>& inputs) {
#if !__DEPLOY_ON_XP_SP2__
                std::cout << "worker thread_id:" << std::this_thread::get_id()
                          << std::endl;
#endif
                for (auto&& item : inputs) {
                    std::cout << "input name: " << item.first
                              << " input dim: " << item.second.second->get_layout().ndim
                              << std::endl;
                }
                finished = true;
            });
#if !__DEPLOY_ON_XP_SP2__
    std::cout << "out thread_id:" << std::this_thread::get_id() << std::endl;
#endif
    //! forward
    network->forward();
    size_t count = 0;
    while (finished == false) {
        count++;
    }
    printf("Forward finish, count is %zu\n", count);
    //! get the output data or read tensor set in network_in
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    printf("length=%zu\n", length);
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}
bool lite::example::set_output_callback(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;
    Config config;
    config.options.var_sanity_check_first_run = false;
    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config);
    network->load_model(network_path);
    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    //! copy or forward data to network
    size_t length = input_tensor->get_tensor_total_size_in_byte();
    void* dst_ptr = input_tensor->get_memory_ptr();
    auto src_tensor = parse_npy(input_path);
    void* src = src_tensor->get_memory_ptr();
    memcpy(dst_ptr, src, length);
    //! set output callback
    volatile bool finished = false;
    network->set_finish_callback(
            [&finished](const std::unordered_map<
                        std::string, std::pair<IO, std::shared_ptr<Tensor>>>& outputs) {
#if !__DEPLOY_ON_XP_SP2__
                std::cout << "worker thread_id:" << std::this_thread::get_id()
                          << std::endl;
#endif
                for (auto&& item : outputs) {
                    std::cout << "output name: " << item.first
                              << " output dim: " << item.second.second->get_layout().ndim
                              << std::endl;
                }
                finished = true;
            });
#if !__DEPLOY_ON_XP_SP2__
    std::cout << "out thread_id:" << std::this_thread::get_id() << std::endl;
#endif
    //! forward
    network->forward();
    network->wait();
    size_t count = 0;
    while (finished == false) {
        count++;
    }
    printf("Forward finish, count is %zu\n", count);
    //! get the output data or read tensor set in network_in
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    printf("length=%zu\n", length);
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
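
For orientation, every example above takes an Args struct declared in ../example.h (not shown on this page); the only fields this file touches are model_path, input_path, and loader_path. The sketch below is a hypothetical, minimal driver written under those assumptions (including that Args lives in lite::example, as the using-directives suggest); the real example harness may register and dispatch these functions differently, but it shows how one of them can be invoked directly.

// Hypothetical standalone driver -- Args and its fields are assumed from
// their usage in basic.cpp above.
#include <cstdio>
#include "../example.h"

int main(int argc, char** argv) {
    if (argc < 3) {
        printf("usage: %s <model_path> <input.npy>\n", argv[0]);
        return -1;
    }
    lite::example::Args args;
    args.model_path = argv[1];  // serialized MegEngine Lite model file
    args.input_path = argv[2];  // .npy input consumed by parse_npy()
    //! run the simplest example: load from a file path and forward on CPU
    return lite::example::basic_load_from_path(args) ? 0 : 1;
}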

The MegEngine installation package already bundles the CUDA environment needed to run code on the GPU, so there is no separate CPU or GPU build to choose between. If you want to run GPU programs, make sure the machine actually has a GPU and that the driver is installed correctly. If you would like to try deep learning development on a cloud GPU computing platform, you are welcome to visit the MegStudio platform.
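
The same single-build idea carries over to the C++ Lite examples above: the device is selected purely through the Config handed to Network, and the CUDA path is compiled only when LITE_WITH_CUDA is defined. The helper below is a minimal sketch condensed from basic_load_from_path (default CPU) and load_from_path_run_cuda (explicit CUDA) in this file; the make_network name and its bool parameter are illustrative only, and the snippet assumes the same headers and using-directives as basic.cpp.

// Minimal sketch: choose the device via Config, as the examples above do.
std::shared_ptr<lite::Network> make_network(bool use_cuda) {
#if LITE_WITH_CUDA
    if (use_cuda) {
        //! the third Config field picks the device, as in load_from_path_run_cuda
        lite::Config config{false, -1, LiteDeviceType::LITE_CUDA};
        return std::make_shared<lite::Network>(config);
    }
#endif
    //! a default-constructed Network runs on the CPU, as in basic_load_from_path
    return std::make_shared<lite::Network>();
}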