
basic.cpp

/**
 * \file example/cpp_example/basic.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include <thread>
#include "example.h"

#if LITE_BUILD_WITH_MGE
#include <cstdio>
#include "misc.h"

using namespace lite;
using namespace example;

namespace {
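//! print the name and shape of every output tensor of the loaded network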
void output_info(std::shared_ptr<Network> network, size_t output_size) {
    for (size_t index = 0; index < output_size; index++) {
        printf("output[%zu] names %s \n", index,
               network->get_all_output_name()[index].c_str());
        std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(index);
        size_t ndim = output_tensor->get_layout().ndim;
        for (size_t i = 0; i < ndim; i++) {
            printf("output[%zu] tensor.shape[%zu] %zu \n", index, i,
                   output_tensor->get_layout().shapes[i]);
        }
    }
}
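
//! scan every output tensor and report its element count, max/min values (with
//! their indices) and sum; the cb macro below expands to one switch case per
//! supported LiteDataType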
void output_data_info(std::shared_ptr<Network> network, size_t output_size) {
    for (size_t index = 0; index < output_size; index++) {
        auto output_tensor = network->get_output_tensor(index);
        void* out_data = output_tensor->get_memory_ptr();
        size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                            output_tensor->get_layout().get_elem_size();
        LiteDataType dtype = output_tensor->get_layout().data_type;
        float max = -1000.0f;
        float min = 1000.0f;
        int max_idx = 0;
        int min_idx = 0;
        float sum = 0.0f;
#define cb(_dtype, _real_dtype)                                            \
    case LiteDataType::_dtype: {                                           \
        for (size_t i = 0; i < out_length; i++) {                          \
            _real_dtype data = static_cast<_real_dtype*>(out_data)[i];     \
            sum += data;                                                   \
            if (max < data) {                                              \
                max = data;                                                \
                max_idx = i;                                               \
            }                                                              \
            if (min > data) {                                              \
                min = data;                                                \
                min_idx = i;                                               \
            }                                                              \
        }                                                                  \
    } break;
        switch (dtype) {
            cb(LITE_FLOAT, float);
            cb(LITE_INT, int);
            cb(LITE_INT8, int8_t);
            cb(LITE_UINT8, uint8_t);
            default:
                printf("unknown datatype");
        }
        printf("output_length %zu index %zu max=%e, max_idx=%d, min=%e, min_idx=%d, "
               "sum=%e\n",
               out_length, index, max, max_idx, min, min_idx, sum);
    }
#undef cb
}
}  // namespace

namespace {
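//! load a model from a file path, copy input data parsed from a .npy file into
//! the input tensor, run a warmup pass plus timed iterations, then dump the
//! output names, shapes and statistics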
bool basic_load_from_path(const Args& args) {
    set_log_level(LiteLogLevel::DEBUG);
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>();
    network->load_model(network_path);

    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    auto layout = input_tensor->get_layout();
    for (size_t i = 0; i < layout.ndim; i++) {
        printf("model input shape[%zu]=%zu \n", i, layout.shapes[i]);
    }

    //! copy or forward data to network
    size_t length = input_tensor->get_tensor_total_size_in_byte();
    void* dst_ptr = input_tensor->get_memory_ptr();
    auto src_tensor = parse_npy(input_path);
    auto layout0 = src_tensor->get_layout();
    for (size_t i = 0; i < layout0.ndim; i++) {
        printf("src shape[%zu]=%zu \n", i, layout0.shapes[i]);
    }
    void* src = src_tensor->get_memory_ptr();
    memcpy(dst_ptr, src, length);

    //! warmup, then 10 forward iterations sharing one running timer
    {
        lite::Timer ltimer("warmup");
        network->forward();
        network->wait();
        ltimer.print_used_time(0);
    }
    lite::Timer ltimer("forward_iter");
    for (int i = 0; i < 10; i++) {
        network->forward();
        network->wait();
        ltimer.print_used_time(i);
    }

    //! warmup again, then 10 iterations with the timer reset before each forward
    {
        lite::Timer ltimer("warmup");
        network->forward();
        network->wait();
        ltimer.print_used_time(0);
    }
    for (int i = 0; i < 10; i++) {
        ltimer.reset_start();
        network->forward();
        network->wait();
        ltimer.print_used_time(i);
    }

    //! get the output data or read tensor set in network_in
    size_t output_size = network->get_all_output_name().size();
    output_info(network, output_size);
    output_data_info(network, output_size);
    return true;
}
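
//! same flow as basic_load_from_path, but first registers an external loader
//! library and resets the input tensor layout/memory from the .npy source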
bool basic_load_from_path_with_loader(const Args& args) {
    set_log_level(LiteLogLevel::DEBUG);
    lite::set_loader_lib_path(args.loader_path);
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>();
    network->load_model(network_path);

    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    auto input_layout = input_tensor->get_layout();

    //! copy or forward data to network
    auto src_tensor = parse_npy(input_path);
    auto src_layout = src_tensor->get_layout();
    if (src_layout.ndim != input_layout.ndim) {
        printf("src ndim is not equal to model input ndim\n");
    }
    //! pay attention: the input shape can change
    for (size_t i = 0; i < input_layout.ndim; i++) {
        if (input_layout.shapes[i] != src_layout.shapes[i]) {
            printf("src shape is not equal to input shape\n");
        }
    }
    input_tensor->set_layout(src_tensor->get_layout());

    //! reset or forward data to network
    input_tensor->reset(src_tensor->get_memory_ptr(), src_tensor->get_layout());

    //! forward
    network->forward();
    network->wait();

    //! warmup, then timed forward iterations
    {
        lite::Timer ltimer("warmup");
        network->forward();
        network->wait();
        ltimer.print_used_time(0);
    }
    lite::Timer ltimer("forward_iter");
    for (int i = 0; i < 10; i++) {
        ltimer.reset_start();
        network->forward();
        network->wait();
        ltimer.print_used_time(i);
    }

    //! get the output data or read tensor set in network_in
    size_t output_size = network->get_all_output_name().size();
    output_info(network, output_size);
    output_data_info(network, output_size);
    return true;
}
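
//! read the whole model file into a host buffer and load the network from
//! that memory instead of from a path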
bool basic_load_from_memory(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>();
    FILE* fin = fopen(network_path.c_str(), "rb");
    if (!fin) {
        printf("failed to open %s.\n", network_path.c_str());
        return false;
    }
    fseek(fin, 0, SEEK_END);
    size_t size = ftell(fin);
    fseek(fin, 0, SEEK_SET);
    void* ptr = malloc(size);
    std::shared_ptr<void> buf{ptr, ::free};
    auto len = fread(buf.get(), 1, size, fin);
    if (len < 1) {
        printf("read file failed.\n");
    }
    fclose(fin);
    network->load_model(buf.get(), size);

    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);

    //! copy or forward data to network
    size_t length = input_tensor->get_tensor_total_size_in_byte();
    void* dst_ptr = input_tensor->get_memory_ptr();
    auto src_tensor = parse_npy(input_path);
    void* src = src_tensor->get_memory_ptr();
    memcpy(dst_ptr, src, length);

    //! forward
    network->forward();
    network->wait();

    //! get the output data or read tensor set in network_in
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    printf("length=%zu\n", length);
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}
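
//! run inference asynchronously: register a callback that flips a flag when
//! the forward pass finishes, then busy-wait on that flag in the main thread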
bool async_forward(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;
    Config config;
    config.options.var_sanity_check_first_run = false;

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config);
    network->load_model(network_path);

    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);

    //! copy or forward data to network
    size_t length = input_tensor->get_tensor_total_size_in_byte();
    void* dst_ptr = input_tensor->get_memory_ptr();
    auto src_tensor = parse_npy(input_path);
    void* src = src_tensor->get_memory_ptr();
    memcpy(dst_ptr, src, length);

    //! set async mode and callback
    volatile bool finished = false;
    network->set_async_callback([&finished]() {
#if !__DEPLOY_ON_XP_SP2__
        std::cout << "worker thread_id:" << std::this_thread::get_id() << std::endl;
#endif
        finished = true;
    });
#if !__DEPLOY_ON_XP_SP2__
    std::cout << "out thread_id:" << std::this_thread::get_id() << std::endl;
#endif

    //! forward
    network->forward();
    size_t count = 0;
    while (finished == false) {
        count++;
    }
    printf("Forward finish, count is %zu\n", count);

    //! get the output data or read tensor set in network_in
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    printf("length=%zu\n", length);
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}
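
//! register a start callback that is invoked before execution with a map of
//! input name -> (IO config, input tensor), useful for inspecting or filling
//! inputs right before the forward pass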
bool set_input_callback(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;
    Config config;
    config.options.var_sanity_check_first_run = false;

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config);
    network->load_model(network_path);

    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);

    //! copy or forward data to network
    size_t length = input_tensor->get_tensor_total_size_in_byte();
    void* dst_ptr = input_tensor->get_memory_ptr();
    auto src_tensor = parse_npy(input_path);
    void* src = src_tensor->get_memory_ptr();
    memcpy(dst_ptr, src, length);

    //! set input callback
    volatile bool finished = false;
    network->set_start_callback(
            [&finished](const std::unordered_map<
                        std::string, std::pair<IO, std::shared_ptr<Tensor>>>& inputs) {
#if !__DEPLOY_ON_XP_SP2__
                std::cout << "worker thread_id:" << std::this_thread::get_id()
                          << std::endl;
#endif
                for (auto&& item : inputs) {
                    std::cout << "input name: " << item.first
                              << ", input dim: "
                              << item.second.second->get_layout().ndim << std::endl;
                }
                finished = true;
            });
#if !__DEPLOY_ON_XP_SP2__
    std::cout << "out thread_id:" << std::this_thread::get_id() << std::endl;
#endif

    //! forward
    network->forward();
    size_t count = 0;
    while (finished == false) {
        count++;
    }
    printf("Forward finish, count is %zu\n", count);

    //! get the output data or read tensor set in network_in
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    printf("length=%zu\n", length);
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}
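
//! register a finish callback that is invoked after execution with a map of
//! output name -> (IO config, output tensor)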
bool set_output_callback(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;
    Config config;
    config.options.var_sanity_check_first_run = false;

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config);
    network->load_model(network_path);

    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);

    //! copy or forward data to network
    size_t length = input_tensor->get_tensor_total_size_in_byte();
    void* dst_ptr = input_tensor->get_memory_ptr();
    auto src_tensor = parse_npy(input_path);
    void* src = src_tensor->get_memory_ptr();
    memcpy(dst_ptr, src, length);

    //! set output callback
    volatile bool finished = false;
    network->set_finish_callback(
            [&finished](const std::unordered_map<
                        std::string, std::pair<IO, std::shared_ptr<Tensor>>>& outputs) {
#if !__DEPLOY_ON_XP_SP2__
                std::cout << "worker thread_id:" << std::this_thread::get_id()
                          << std::endl;
#endif
                for (auto&& item : outputs) {
                    std::cout << "output name: " << item.first
                              << ", output dim: "
                              << item.second.second->get_layout().ndim << std::endl;
                }
                finished = true;
            });
#if !__DEPLOY_ON_XP_SP2__
    std::cout << "out thread_id:" << std::this_thread::get_id() << std::endl;
#endif

    //! forward
    network->forward();
    network->wait();
    size_t count = 0;
    while (finished == false) {
        count++;
    }
    printf("Forward finish, count is %zu\n", count);

    //! get the output data or read tensor set in network_in
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    printf("length=%zu\n", length);
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}
}  // namespace
REGIST_EXAMPLE("basic_load_from_path", basic_load_from_path);
REGIST_EXAMPLE("basic_load_from_path_with_loader", basic_load_from_path_with_loader);
REGIST_EXAMPLE("basic_load_from_memory", basic_load_from_memory);
REGIST_EXAMPLE("async_forward", async_forward);
REGIST_EXAMPLE("set_input_callback", set_input_callback);
REGIST_EXAMPLE("set_output_callback", set_output_callback);

#if LITE_WITH_CUDA
namespace {
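//! run the model on a CUDA device: the input IO is marked as device-side, the
//! .npy data is copied into a device tensor, and the input tensor is reset to
//! point at that device memory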
bool load_from_path_run_cuda(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;
    set_log_level(LiteLogLevel::DEBUG);

    //! config the network running in CUDA device
    lite::Config config{false, -1, LiteDeviceType::LITE_CUDA};

    //! set NetworkIO
    NetworkIO network_io;
    std::string input_name = "img0_comp_fullface";
    bool is_host = false;
    IO device_input{input_name, is_host};
    network_io.inputs.push_back(device_input);

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config, network_io);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    Layout input_layout = input_tensor->get_layout();

    //! read data from numpy data file
    auto src_tensor = parse_npy(input_path);

    //! malloc the device memory
    auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);

    //! copy to the device memory
    tensor_device.copy_from(*src_tensor);

    //! now the device memory is filled with user input data, set it to the
    //! input tensor
    input_tensor->reset(tensor_device.get_memory_ptr(), input_layout);

    //! warmup, then timed forward iterations
    {
        lite::Timer ltimer("warmup");
        network->forward();
        network->wait();
        ltimer.print_used_time(0);
    }
    lite::Timer ltimer("forward_iter");
    for (int i = 0; i < 10; i++) {
        ltimer.reset_start();
        network->forward();
        network->wait();
        ltimer.print_used_time(i);
    }

    //! get the output data or read tensor set in network_in
    size_t output_size = network->get_all_output_name().size();
    output_info(network, output_size);
    output_data_info(network, output_size);
    return true;
}
}  // namespace

REGIST_EXAMPLE("load_from_path_run_cuda", load_from_path_run_cuda);
#endif
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}