
basic.cpp

/**
 * \file example/basic.cpp
 *
 * This file is part of MegEngine, a deep learning framework developed by
 * Megvii.
 *
 * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
 */
#include <iostream>
#include <thread>
#include "../example.h"
#if LITE_BUILD_WITH_MGE
#include <cstdio>
#include "misc.h"
using namespace lite;
using namespace example;

namespace {
void output_info(std::shared_ptr<Network> network, size_t output_size) {
    for (size_t index = 0; index < output_size; index++) {
        printf("output[%zu] name %s \n", index,
               network->get_all_output_name()[index].c_str());
        std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(index);
        size_t ndim = output_tensor->get_layout().ndim;
        for (size_t i = 0; i < ndim; i++) {
            printf("output[%zu] tensor.shape[%zu] %zu \n", index, i,
                   output_tensor->get_layout().shapes[i]);
        }
    }
}
void output_data_info(std::shared_ptr<Network> network, size_t output_size) {
    for (size_t index = 0; index < output_size; index++) {
        auto output_tensor = network->get_output_tensor(index);
        void* out_data = output_tensor->get_memory_ptr();
        size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                            output_tensor->get_layout().get_elem_size();
        LiteDataType dtype = output_tensor->get_layout().data_type;
        float max = -1000.0f;
        float min = 1000.0f;
        int max_idx = 0;
        int min_idx = 0;
        float sum = 0.0f;
#define cb(_dtype, _real_dtype)                                            \
    case LiteDataType::_dtype: {                                           \
        for (size_t i = 0; i < out_length; i++) {                          \
            _real_dtype data = static_cast<_real_dtype*>(out_data)[i];     \
            sum += data;                                                   \
            if (max < data) {                                              \
                max = data;                                                \
                max_idx = i;                                               \
            }                                                              \
            if (min > data) {                                              \
                min = data;                                                \
                min_idx = i;                                               \
            }                                                              \
        }                                                                  \
    } break;
        switch (dtype) {
            cb(LITE_FLOAT, float);
            cb(LITE_INT, int);
            cb(LITE_INT8, int8_t);
            cb(LITE_UINT8, uint8_t);
            default:
                printf("unknown datatype\n");
        }
        printf("output_length %zu index %zu max=%e, max_idx=%d, min=%e, min_idx=%d, sum=%e\n",
               out_length, index, max, max_idx, min, min_idx, sum);
    }
#undef cb
}
}  // namespace
#if LITE_WITH_CUDA
bool lite::example::load_from_path_run_cuda(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;
    set_log_level(LiteLogLevel::DEBUG);
    //! config the network to run on a CUDA device
    lite::Config config{false, -1, LiteDeviceType::LITE_CUDA};
    //! set NetworkIO
    NetworkIO network_io;
    std::string input_name = "img0_comp_fullface";
    bool is_host = false;
    IO device_input{input_name, is_host};
    network_io.inputs.push_back(device_input);
    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config, network_io);
    network->load_model(network_path);
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    Layout input_layout = input_tensor->get_layout();
    //! read data from the numpy data file
    auto src_tensor = parse_npy(input_path);
    //! malloc the device memory
    auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);
    //! copy to the device memory
    tensor_device.copy_from(*src_tensor);
    //! now the device memory is filled with user input data, set it to the
    //! input tensor
    input_tensor->reset(tensor_device.get_memory_ptr(), input_layout);
    //! forward
    {
        lite::Timer ltimer("warmup");
        network->forward();
        network->wait();
        ltimer.print_used_time(0);
    }
    lite::Timer ltimer("forward_iter");
    for (int i = 0; i < 10; i++) {
        ltimer.reset_start();
        network->forward();
        network->wait();
        ltimer.print_used_time(i);
    }
    //! get the output data or read the tensors set in network_io
    size_t output_size = network->get_all_output_name().size();
    output_info(network, output_size);
    output_data_info(network, output_size);
    return true;
}
#endif
bool lite::example::basic_load_from_path(const Args& args) {
    set_log_level(LiteLogLevel::DEBUG);
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;
    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>();
    network->load_model(network_path);
    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    auto layout = input_tensor->get_layout();
    for (size_t i = 0; i < layout.ndim; i++) {
        printf("model input shape[%zu]=%zu \n", i, layout.shapes[i]);
    }
    //! copy or forward data to network
    size_t length = input_tensor->get_tensor_total_size_in_byte();
    void* dst_ptr = input_tensor->get_memory_ptr();
    auto src_tensor = parse_npy(input_path);
    auto layout0 = src_tensor->get_layout();
    for (size_t i = 0; i < layout0.ndim; i++) {
        printf("src shape[%zu]=%zu \n", i, layout0.shapes[i]);
    }
    void* src = src_tensor->get_memory_ptr();
    memcpy(dst_ptr, src, length);
    //! forward
    {
        lite::Timer ltimer("warmup");
        network->forward();
        network->wait();
        ltimer.print_used_time(0);
    }
    lite::Timer ltimer("forward_iter");
    for (int i = 0; i < 10; i++) {
        network->forward();
        network->wait();
        ltimer.print_used_time(i);
    }
    //! forward again: warm up, then time each iteration separately
    {
        lite::Timer ltimer("warmup");
        network->forward();
        network->wait();
        ltimer.print_used_time(0);
    }
    for (int i = 0; i < 10; i++) {
        ltimer.reset_start();
        network->forward();
        network->wait();
        ltimer.print_used_time(i);
    }
    //! get the output data or read the tensors set in network_io
    size_t output_size = network->get_all_output_name().size();
    output_info(network, output_size);
    output_data_info(network, output_size);
    return true;
}
bool lite::example::basic_load_from_path_with_loader(const Args& args) {
    set_log_level(LiteLogLevel::DEBUG);
    lite::set_loader_lib_path(args.loader_path);
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;
    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>();
    network->load_model(network_path);
    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    auto input_layout = input_tensor->get_layout();
    //! copy or forward data to network
    auto src_tensor = parse_npy(input_path);
    auto src_layout = src_tensor->get_layout();
    if (src_layout.ndim != input_layout.ndim) {
        printf("src ndim is not equal to model input ndim\n");
    }
    //! pay attention: the input shape can change
    for (size_t i = 0; i < input_layout.ndim; i++) {
        if (input_layout.shapes[i] != src_layout.shapes[i]) {
            printf("src shape is not equal to input shape\n");
        }
    }
    input_tensor->set_layout(src_tensor->get_layout());
    //! reset or forward data to network
    input_tensor->reset(src_tensor->get_memory_ptr(), src_tensor->get_layout());
    //! forward
    network->forward();
    network->wait();
    //! warm up, then time each iteration
    {
        lite::Timer ltimer("warmup");
        network->forward();
        network->wait();
        ltimer.print_used_time(0);
    }
    lite::Timer ltimer("forward_iter");
    for (int i = 0; i < 10; i++) {
        ltimer.reset_start();
        network->forward();
        network->wait();
        ltimer.print_used_time(i);
    }
    //! get the output data or read the tensors set in network_io
    size_t output_size = network->get_all_output_name().size();
    output_info(network, output_size);
    output_data_info(network, output_size);
    return true;
}
bool lite::example::basic_load_from_memory(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;
    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>();
    FILE* fin = fopen(network_path.c_str(), "rb");
    if (!fin) {
        printf("failed to open %s.\n", network_path.c_str());
        return false;
    }
    fseek(fin, 0, SEEK_END);
    size_t size = ftell(fin);
    fseek(fin, 0, SEEK_SET);
    void* ptr = malloc(size);
    std::shared_ptr<void> buf{ptr, ::free};
    auto len = fread(buf.get(), 1, size, fin);
    if (len != size) {
        printf("read file failed.\n");
        fclose(fin);
        return false;
    }
    fclose(fin);
    network->load_model(buf.get(), size);
    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    //! copy or forward data to network
    size_t length = input_tensor->get_tensor_total_size_in_byte();
    void* dst_ptr = input_tensor->get_memory_ptr();
    auto src_tensor = parse_npy(input_path);
    void* src = src_tensor->get_memory_ptr();
    memcpy(dst_ptr, src, length);
    //! forward
    network->forward();
    network->wait();
    //! get the output data or read the tensors set in network_io
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    printf("length=%zu\n", length);
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}
bool lite::example::async_forward(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;
    Config config;
    config.options.var_sanity_check_first_run = false;
    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config);
    network->load_model(network_path);
    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    //! copy or forward data to network
    size_t length = input_tensor->get_tensor_total_size_in_byte();
    void* dst_ptr = input_tensor->get_memory_ptr();
    auto src_tensor = parse_npy(input_path);
    void* src = src_tensor->get_memory_ptr();
    memcpy(dst_ptr, src, length);
    //! set async mode and callback
    volatile bool finished = false;
    network->set_async_callback([&finished]() {
#if !__DEPLOY_ON_XP_SP2__
        std::cout << "worker thread_id:" << std::this_thread::get_id() << std::endl;
#endif
        finished = true;
    });
#if !__DEPLOY_ON_XP_SP2__
    std::cout << "out thread_id:" << std::this_thread::get_id() << std::endl;
#endif
    //! forward
    network->forward();
    //! spin until the async callback marks the forward as finished
    size_t count = 0;
    while (finished == false) {
        count++;
    }
    printf("Forward finish, count is %zu\n", count);
    //! get the output data or read the tensors set in network_io
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    printf("length=%zu\n", length);
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
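For orientation, a minimal driver for these examples might look like the sketch below. It is only an illustration: it assumes that example.h declares lite::example::Args with the model_path and input_path fields used above, and that the header is reachable on the include path; the real example harness in the repository may register and dispatch these functions differently.

// Hypothetical driver, for illustration only; Args fields are taken from the
// usage in basic.cpp above, the actual harness may parse arguments differently.
#include <cstdio>
#include "example.h"

int main(int argc, char** argv) {
    if (argc < 3) {
        printf("usage: %s <model_path> <input_path>\n", argv[0]);
        return 1;
    }
    lite::example::Args args;
    args.model_path = argv[1];
    args.input_path = argv[2];
    //! run the simplest example: load from path, forward, print outputs
    return lite::example::basic_load_from_path(args) ? 0 : 1;
}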

The MegEngine installation package already bundles the CUDA environment needed to run code on the GPU, so there is no separate CPU and GPU build to choose between. To run GPU programs, make sure the machine actually has a GPU device and that the driver is installed. If you would like to try deep-learning development on a cloud GPU platform, you are welcome to visit the MegStudio platform.
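Before calling the CUDA example above, it can be useful to check that a CUDA device is actually visible at runtime. Below is a minimal sketch, assuming the Lite C++ API exposes lite::get_device_count() in lite/global.h (verify the declaration in your installed version); the helper name run_cuda_example_if_possible is hypothetical.

// Sketch only: skip the CUDA example when no GPU device is visible.
// Assumes lite::get_device_count() exists in this Lite version.
#include <cstdio>
#include "lite/global.h"
#include "example.h"

bool run_cuda_example_if_possible(const lite::example::Args& args) {
#if LITE_WITH_CUDA
    if (lite::get_device_count(LiteDeviceType::LITE_CUDA) > 0) {
        //! a CUDA device is visible, run the device-input example
        return lite::example::load_from_path_run_cuda(args);
    }
#endif
    printf("no visible CUDA device, skipping the CUDA example.\n");
    return false;
}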