
device_io.cpp

/**
 * \file example/device_io.cpp
 *
 * This file is part of MegEngine, a deep learning framework developed by
 * Megvii.
 *
 * \copyright Copyright (c) 2020-2021 Megvii Inc. All rights reserved.
 */

#include <thread>
#include "../example.h"

#if LITE_BUILD_WITH_MGE

using namespace lite;
using namespace example;

#if LITE_WITH_CUDA

bool lite::example::device_input(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! config the network running in CUDA device
    lite::Config config{LiteDeviceType::LITE_CUDA};

    //! set NetworkIO
    NetworkIO network_io;
    std::string input_name = "data";
    bool is_host = false;
    IO device_input{input_name, is_host};
    network_io.inputs.push_back(device_input);

    //! create and load the network
    std::shared_ptr<Network> network =
            std::make_shared<Network>(config, network_io);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    Layout input_layout = input_tensor->get_layout();

    //! read data from numpy data file
    auto src_tensor = parse_npy(input_path);

    //! malloc the device memory
    auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);

    //! copy to the device memory
    tensor_device.copy_from(*src_tensor);
    //! Now the device memory is filled with user input data, set it to the
    //! input tensor
    input_tensor->reset(tensor_device.get_memory_ptr(), input_layout);

    //! forward
    network->forward();
    network->wait();

    //! get the output data or read tensor set in network_in
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}

bool lite::example::device_input_output(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! config the network running in CUDA device
    lite::Config config{LiteDeviceType::LITE_CUDA};

    //! set NetworkIO include input and output
    NetworkIO network_io;
    std::string input_name = "data";
    std::string output_name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]";
    bool is_host = false;
    IO device_input{input_name, is_host};
    IO device_output{output_name, is_host};
    network_io.inputs.push_back(device_input);
    network_io.outputs.push_back(device_output);

    //! create and load the network
    std::shared_ptr<Network> network =
            std::make_shared<Network>(config, network_io);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor_device = network->get_input_tensor(0);
    Layout input_layout = input_tensor_device->get_layout();

    //! read data from numpy data file
    auto src_tensor = parse_npy(input_path);

    //! malloc the device memory
    auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);

    //! copy to the device memory
    tensor_device.copy_from(*src_tensor);

    //! Now the device memory is filled with user input data, set it to the
    //! input tensor
    input_tensor_device->reset(tensor_device.get_memory_ptr(), input_layout);

    //! forward
    network->forward();
    network->wait();

    //! output is in device, should copy it to host
    std::shared_ptr<Tensor> output_tensor_device =
            network->get_io_tensor(output_name);

    auto output_tensor = std::make_shared<Tensor>();
    output_tensor->copy_from(*output_tensor_device);

    //! get the output data or read tensor set in network_in
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}

bool lite::example::pinned_host_input(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! config the network running in CUDA device
    lite::Config config{LiteDeviceType::LITE_CUDA};

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    Layout input_layout = input_tensor->get_layout();

    //! read data from numpy data file
    auto src_tensor = parse_npy(input_path);

    //! malloc the pinned host memory
    bool is_pinned_host = true;
    auto tensor_pinned_input =
            Tensor(LiteDeviceType::LITE_CUDA, input_layout, is_pinned_host);

    //! copy to the pinned memory
    tensor_pinned_input.copy_from(*src_tensor);

    //! set the pinned host memory to the network as input
    input_tensor->reset(tensor_pinned_input.get_memory_ptr(), input_layout);

    //! forward
    network->forward();
    network->wait();

    //! get the output data or read tensor set in network_in
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}

#endif
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
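For reference, below is a minimal sketch of how these entry points could be driven directly. It assumes Args is default-constructible and only sets the model_path and input_path fields that the examples above actually read; the file names are hypothetical placeholders, and the real example harness may dispatch these functions differently.

#include "../example.h"

#if LITE_BUILD_WITH_MGE && LITE_WITH_CUDA
int main() {
    lite::example::Args args;              //! assumption: Args is default-constructible
    args.model_path = "network.mge";       //! hypothetical model file exported for MegEngine Lite
    args.input_path = "input.npy";         //! hypothetical numpy file matching the "data" input
    //! run the device-memory input example defined above
    bool ok = lite::example::device_input(args);
    return ok ? 0 : 1;
}
#endif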

The MegEngine installation package already bundles the CUDA environment needed to run code on the GPU, so there is no need to choose between separate CPU and GPU builds. To run GPU programs, make sure the machine has a GPU and that the driver is properly installed. If you would like to try deep learning development on a cloud GPU computing platform, you are welcome to visit the MegStudio platform.