
device_io.cpp

/**
 * \file example/device_io.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include <thread>
#include "../example.h"

#if LITE_BUILD_WITH_MGE

using namespace lite;
using namespace example;

#if LITE_WITH_CUDA

bool lite::example::device_input(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! config the network running in CUDA device
    lite::Config config{LiteDeviceType::LITE_CUDA};

    //! set NetworkIO
    NetworkIO network_io;
    std::string input_name = "data";
    bool is_host = false;
    IO device_input{input_name, is_host};
    network_io.inputs.push_back(device_input);

    //! create and load the network
    std::shared_ptr<Network> network =
            std::make_shared<Network>(config, network_io);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    Layout input_layout = input_tensor->get_layout();

    //! read data from numpy data file
    auto src_tensor = parse_npy(input_path);

    //! malloc the device memory
    auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);

    //! copy to the device memory
    tensor_device.copy_from(*src_tensor);

    //! Now the device memory is filled with user input data, set it to the
    //! input tensor
    input_tensor->reset(tensor_device.get_memory_ptr(), input_layout);

    //! forward
    network->forward();
    network->wait();

    //! get the output data or read tensor set in network_in
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}

bool lite::example::device_input_output(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! config the network running in CUDA device
    lite::Config config{LiteDeviceType::LITE_CUDA};

    //! set NetworkIO include input and output
    NetworkIO network_io;
    std::string input_name = "data";
    std::string output_name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]";
    bool is_host = false;
    IO device_input{input_name, is_host};
    IO device_output{output_name, is_host};
    network_io.inputs.push_back(device_input);
    network_io.outputs.push_back(device_output);

    //! create and load the network
    std::shared_ptr<Network> network =
            std::make_shared<Network>(config, network_io);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor_device = network->get_input_tensor(0);
    Layout input_layout = input_tensor_device->get_layout();

    //! read data from numpy data file
    auto src_tensor = parse_npy(input_path);

    //! malloc the device memory
    auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);

    //! copy to the device memory
    tensor_device.copy_from(*src_tensor);

    //! Now the device memory is filled with user input data, set it to the
    //! input tensor
    input_tensor_device->reset(tensor_device.get_memory_ptr(), input_layout);

    //! forward
    network->forward();
    network->wait();

    //! output is in device, should copy it to host
    std::shared_ptr<Tensor> output_tensor_device =
            network->get_io_tensor(output_name);
    auto output_tensor = std::make_shared<Tensor>();
    output_tensor->copy_from(*output_tensor_device);

    //! get the output data or read tensor set in network_in
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}

bool lite::example::pinned_host_input(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! config the network running in CUDA device
    lite::Config config{LiteDeviceType::LITE_CUDA};

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    Layout input_layout = input_tensor->get_layout();

    //! read data from numpy data file
    auto src_tensor = parse_npy(input_path);

    //! malloc the pinned host memory
    bool is_pinned_host = true;
    auto tensor_pinned_input =
            Tensor(LiteDeviceType::LITE_CUDA, input_layout, is_pinned_host);

    //! copy to the pinned memory
    tensor_pinned_input.copy_from(*src_tensor);

    //! set the pinned host memory to the network as input
    input_tensor->reset(tensor_pinned_input.get_memory_ptr(), input_layout);

    //! forward
    network->forward();
    network->wait();

    //! get the output data or read tensor set in network_in
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}

#endif
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
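
For reference, below is a minimal sketch of how the three examples above might be driven from a standalone program. It assumes the Args struct declared in ../example.h exposes the model_path and input_path fields used above, that Args lives in the lite::example namespace, and that the program is built with LITE_BUILD_WITH_MGE and LITE_WITH_CUDA defined; the command-line handling is purely illustrative and is not part of device_io.cpp.

    // Hypothetical driver (illustrative only), invoking the CUDA I/O examples above.
    #include <cstdio>
    #include "../example.h"

    int main(int argc, char** argv) {
        if (argc < 3) {
            printf("usage: %s <model_path> <input.npy>\n", argv[0]);
            return 1;
        }
        lite::example::Args args;   // assumed to carry the two fields read above
        args.model_path = argv[1];
        args.input_path = argv[2];

        // run the three CUDA I/O examples in turn
        bool ok = lite::example::device_input(args) &&
                  lite::example::device_input_output(args) &&
                  lite::example::pinned_host_input(args);
        return ok ? 0 : 1;
    }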

The MegEngine installation package already bundles the CUDA environment needed to run code on the GPU, so there is no separate CPU and GPU build. To run GPU programs, make sure the machine has a GPU device and that the driver is installed. If you would like to try deep-learning development on a cloud GPU platform, you are welcome to visit the MegStudio platform.
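
If you are unsure whether a machine can run the GPU examples, one way to verify that a device and driver are visible is to query the CUDA runtime directly. The standalone snippet below is only a sketch for that check and is not part of the example code; it uses the standard cudaGetDeviceCount / cudaGetErrorString calls from the CUDA runtime API.

    // Standalone sketch: query the CUDA runtime for visible GPUs before
    // attempting to run the LITE_CUDA examples.
    #include <cstdio>
    #include <cuda_runtime.h>

    int main() {
        int count = 0;
        cudaError_t err = cudaGetDeviceCount(&count);
        if (err != cudaSuccess || count == 0) {
            printf("no usable CUDA device: %s\n", cudaGetErrorString(err));
            return 1;
        }
        printf("found %d CUDA device(s)\n", count);
        return 0;
    }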