
basic.cpp

/**
 * \file example/basic.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include <thread>
#include "../example.h"
#if LITE_BUILD_WITH_MGE
#include <cstdio>
#include <iostream>
#include "misc.h"

using namespace lite;
using namespace example;

namespace {
void output_info(std::shared_ptr<Network> network, size_t output_size) {
    for (size_t index = 0; index < output_size; index++) {
        printf("output[%zu] names %s \n", index,
               network->get_all_output_name()[index].c_str());
        std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(index);
        size_t ndim = output_tensor->get_layout().ndim;
        for (size_t i = 0; i < ndim; i++) {
            printf("output[%zu] tensor.shape[%zu] %zu \n", index, i,
                   output_tensor->get_layout().shapes[i]);
        }
    }
}
void output_data_info(std::shared_ptr<Network> network, size_t output_size) {
    for (size_t index = 0; index < output_size; index++) {
        auto output_tensor = network->get_output_tensor(index);
        void* out_data = output_tensor->get_memory_ptr();
        size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                            output_tensor->get_layout().get_elem_size();
        LiteDataType dtype = output_tensor->get_layout().data_type;
        float max = -1000.0f;
        float min = 1000.0f;
        int max_idx = 0;
        int min_idx = 0;
        float sum = 0.0f;
#define cb(_dtype, _real_dtype)                                            \
    case LiteDataType::_dtype: {                                           \
        for (size_t i = 0; i < out_length; i++) {                          \
            _real_dtype data = static_cast<_real_dtype*>(out_data)[i];     \
            sum += data;                                                   \
            if (max < data) {                                              \
                max = data;                                                \
                max_idx = i;                                               \
            }                                                              \
            if (min > data) {                                              \
                min = data;                                                \
                min_idx = i;                                               \
            }                                                              \
        }                                                                  \
    } break;

        switch (dtype) {
            cb(LITE_FLOAT, float);
            cb(LITE_INT, int);
            cb(LITE_INT8, int8_t);
            cb(LITE_UINT8, uint8_t);
            default:
                printf("unknown datatype\n");
        }
        printf("output_length %zu index %zu max=%e , max idx=%d, min=%e , min_idx=%d, sum=%e\n",
               out_length, index, max, max_idx, min, min_idx, sum);
    }
#undef cb
}
}  // namespace
#if LITE_WITH_CUDA
bool lite::example::load_from_path_run_cuda(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;
    set_log_level(LiteLogLevel::DEBUG);
    //! config the network running in CUDA device
    lite::Config config{false, -1, LiteDeviceType::LITE_CUDA};
    //! set NetworkIO
    NetworkIO network_io;
    std::string input_name = "img0_comp_fullface";
    bool is_host = false;
    IO device_input{input_name, is_host};
    network_io.inputs.push_back(device_input);

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config, network_io);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    Layout input_layout = input_tensor->get_layout();

    //! read data from numpy data file
    auto src_tensor = parse_npy(input_path);
    //! malloc the device memory
    auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);
    //! copy to the device memory
    tensor_device.copy_from(*src_tensor);

    //! Now the device memory is filled with user input data, set it to the
    //! input tensor
    input_tensor->reset(tensor_device.get_memory_ptr(), input_layout);

    //! forward
    {
        lite::Timer ltimer("warmup");
        network->forward();
        network->wait();
        ltimer.print_used_time(0);
    }
    lite::Timer ltimer("forward_iter");
    for (int i = 0; i < 10; i++) {
        ltimer.reset_start();
        network->forward();
        network->wait();
        ltimer.print_used_time(i);
    }

    //! get the output data or read tensor set in network_in
    size_t output_size = network->get_all_output_name().size();
    output_info(network, output_size);
    output_data_info(network, output_size);
    return true;
}
#endif
bool lite::example::basic_load_from_path(const Args& args) {
    set_log_level(LiteLogLevel::DEBUG);
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>();
    network->load_model(network_path);

    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    auto layout = input_tensor->get_layout();
    for (size_t i = 0; i < layout.ndim; i++) {
        printf("model input shape[%zu]=%zu \n", i, layout.shapes[i]);
    }

    //! copy or forward data to network
    size_t length = input_tensor->get_tensor_total_size_in_byte();
    void* dst_ptr = input_tensor->get_memory_ptr();
    auto src_tensor = parse_npy(input_path);
    auto layout0 = src_tensor->get_layout();
    for (size_t i = 0; i < layout0.ndim; i++) {
        printf("src shape[%zu]=%zu \n", i, layout0.shapes[i]);
    }
    void* src = src_tensor->get_memory_ptr();
    memcpy(dst_ptr, src, length);

    //! forward
    {
        lite::Timer ltimer("warmup");
        network->forward();
        network->wait();
        ltimer.print_used_time(0);
    }
    lite::Timer ltimer("forward_iter");
    for (int i = 0; i < 10; i++) {
        ltimer.reset_start();
        network->forward();
        network->wait();
        ltimer.print_used_time(i);
    }

    //! get the output data or read tensor set in network_in
    size_t output_size = network->get_all_output_name().size();
    output_info(network, output_size);
    output_data_info(network, output_size);
    return true;
}
bool lite::example::basic_load_from_path_with_loader(const Args& args) {
    set_log_level(LiteLogLevel::DEBUG);
    lite::set_loader_lib_path(args.loader_path);
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>();
    network->load_model(network_path);

    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    auto input_layout = input_tensor->get_layout();

    //! copy or forward data to network
    auto src_tensor = parse_npy(input_path);
    auto src_layout = src_tensor->get_layout();
    if (src_layout.ndim != input_layout.ndim) {
        printf("src ndim is not equal to model input ndim\n");
    }
    //! pay attention: the input shape can change
    for (size_t i = 0; i < input_layout.ndim; i++) {
        if (input_layout.shapes[i] != src_layout.shapes[i]) {
            printf("src shape is not equal to input shape\n");
        }
    }
    input_tensor->set_layout(src_tensor->get_layout());

    //! reset or forward data to network
    input_tensor->reset(src_tensor->get_memory_ptr(), src_tensor->get_layout());

    //! forward
    network->forward();
    network->wait();

    //! forward
    {
        lite::Timer ltimer("warmup");
        network->forward();
        network->wait();
        ltimer.print_used_time(0);
    }
    lite::Timer ltimer("forward_iter");
    for (int i = 0; i < 10; i++) {
        ltimer.reset_start();
        network->forward();
        network->wait();
        ltimer.print_used_time(i);
    }

    //! get the output data or read tensor set in network_in
    size_t output_size = network->get_all_output_name().size();
    output_info(network, output_size);
    output_data_info(network, output_size);
    return true;
}
bool lite::example::basic_load_from_memory(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>();

    FILE* fin = fopen(network_path.c_str(), "rb");
    if (!fin) {
        printf("failed to open %s.", network_path.c_str());
    }
    fseek(fin, 0, SEEK_END);
    size_t size = ftell(fin);
    fseek(fin, 0, SEEK_SET);
    void* ptr = malloc(size);
    std::shared_ptr<void> buf{ptr, ::free};
    auto len = fread(buf.get(), 1, size, fin);
    if (len < 1) {
        printf("read file failed.\n");
    }
    fclose(fin);

    network->load_model(buf.get(), size);

    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    //! copy or forward data to network
    size_t length = input_tensor->get_tensor_total_size_in_byte();
    void* dst_ptr = input_tensor->get_memory_ptr();
    auto src_tensor = parse_npy(input_path);
    void* src = src_tensor->get_memory_ptr();
    memcpy(dst_ptr, src, length);

    //! forward
    network->forward();
    network->wait();

    //! get the output data or read tensor set in network_in
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    printf("length=%zu\n", length);
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}
bool lite::example::async_forward(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;
    Config config;
    config.options.var_sanity_check_first_run = false;

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config);
    network->load_model(network_path);

    //! set input data to input tensor
    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    //! copy or forward data to network
    size_t length = input_tensor->get_tensor_total_size_in_byte();
    void* dst_ptr = input_tensor->get_memory_ptr();
    auto src_tensor = parse_npy(input_path);
    void* src = src_tensor->get_memory_ptr();
    memcpy(dst_ptr, src, length);

    //! set async mode and callback
    volatile bool finished = false;
    network->set_async_callback([&finished]() {
#if !__DEPLOY_ON_XP_SP2__
        std::cout << "worker thread_id:" << std::this_thread::get_id() << std::endl;
#endif
        finished = true;
    });
#if !__DEPLOY_ON_XP_SP2__
    std::cout << "out thread_id:" << std::this_thread::get_id() << std::endl;
#endif

    //! forward
    network->forward();
    size_t count = 0;
    while (finished == false) {
        count++;
    }
    printf("Forward finished, count is %zu\n", count);

    //! get the output data or read tensor set in network_in
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    printf("length=%zu\n", length);
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
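
For context, here is a minimal sketch of how one of these entry points might be driven from a standalone program. It assumes only what is visible above: an `Args` structure carrying `model_path` and `input_path`, and the `lite::example::basic_load_from_path` function. The actual harness in `../example.h` may register and dispatch the examples differently, so treat this as illustrative only.

    // Hypothetical driver, not part of the original example harness.
    #include <cstdio>
    #include "../example.h"

    int main(int argc, char** argv) {
        using namespace lite::example;
        if (argc < 3) {
            printf("usage: %s <model.mge> <input.npy>\n", argv[0]);
            return 1;
        }
        Args args;
        args.model_path = argv[1];  // serialized MegEngine model
        args.input_path = argv[2];  // .npy input consumed by parse_npy
        //! run the simplest example: load, copy input, forward, dump outputs
        return basic_load_from_path(args) ? 0 : 1;
    }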

The MegEngine installation package already bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU and GPU build to choose between. To run GPU programs, make sure the machine actually has a GPU and that the driver is installed. If you would like to try deep-learning development on a cloud GPU platform, you are welcome to visit the MegStudio platform.
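
In the Lite C++ API used in this file, the target device is likewise chosen when the `Config` is built; below is a minimal sketch mirroring `load_from_path_run_cuda` above (the default-constructed `Config` is assumed to target the CPU, and the model path is a placeholder):

    // Device selection sketch based on the configs used in this file.
    lite::Config cpu_config;  // default configuration, assumed to run on CPU
    lite::Config cuda_config{false, -1, LiteDeviceType::LITE_CUDA};  // same arguments as the CUDA example above
    auto network = std::make_shared<lite::Network>(cuda_config);
    network->load_model("model.mge");  // placeholder model path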