
device_io.cpp

/**
 * \file example/cpp_example/device_io.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include <thread>
#include "../example.h"

#if LITE_BUILD_WITH_MGE
using namespace lite;
using namespace example;

#if LITE_WITH_CUDA
bool lite::example::device_input(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! config the network running in CUDA device
    lite::Config config{LiteDeviceType::LITE_CUDA};

    //! set NetworkIO
    NetworkIO network_io;
    std::string input_name = "data";
    bool is_host = false;
    IO device_input{input_name, is_host};
    network_io.inputs.push_back(device_input);

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config, network_io);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    Layout input_layout = input_tensor->get_layout();

    //! read data from numpy data file
    auto src_tensor = parse_npy(input_path);

    //! malloc the device memory
    auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);

    //! copy to the device memory
    tensor_device.copy_from(*src_tensor);

    //! Now the device memory is filled with user input data, set it to the
    //! input tensor
    input_tensor->reset(tensor_device.get_memory_ptr(), input_layout);

    //! forward
    network->forward();
    network->wait();

    //! get the output data or read tensor set in network_in
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}
bool lite::example::device_input_output(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! config the network running in CUDA device
    lite::Config config{LiteDeviceType::LITE_CUDA};

    //! set NetworkIO include input and output
    NetworkIO network_io;
    std::string input_name = "data";
    std::string output_name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]";
    bool is_host = false;
    IO device_input{input_name, is_host};
    IO device_output{output_name, is_host};
    network_io.inputs.push_back(device_input);
    network_io.outputs.push_back(device_output);

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config, network_io);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor_device = network->get_input_tensor(0);
    Layout input_layout = input_tensor_device->get_layout();

    //! read data from numpy data file
    auto src_tensor = parse_npy(input_path);

    //! malloc the device memory
    auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);

    //! copy to the device memory
    tensor_device.copy_from(*src_tensor);

    //! Now the device memory is filled with user input data, set it to the
    //! input tensor
    input_tensor_device->reset(tensor_device.get_memory_ptr(), input_layout);

    //! forward
    network->forward();
    network->wait();

    //! output is in device, should copy it to host
    std::shared_ptr<Tensor> output_tensor_device = network->get_io_tensor(output_name);
    auto output_tensor = std::make_shared<Tensor>();
    output_tensor->copy_from(*output_tensor_device);

    //! get the output data or read tensor set in network_in
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}
bool lite::example::pinned_host_input(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! config the network running in CUDA device
    lite::Config config{LiteDeviceType::LITE_CUDA};

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    Layout input_layout = input_tensor->get_layout();

    //! read data from numpy data file
    auto src_tensor = parse_npy(input_path);

    //! malloc the pinned host memory
    bool is_pinned_host = true;
    auto tensor_pinned_input =
            Tensor(LiteDeviceType::LITE_CUDA, input_layout, is_pinned_host);

    //! copy to the pinned memory
    tensor_pinned_input.copy_from(*src_tensor);

    //! set the pinned host memory to the network as input
    input_tensor->reset(tensor_pinned_input.get_memory_ptr(), input_layout);

    //! forward
    network->forward();
    network->wait();

    //! get the output data or read tensor set in network_in
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}

#endif
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
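
The three examples above all stage the input in CUDA device memory or in pinned host memory. For comparison, below is a minimal sketch of the default host-memory path, using only the Lite calls already shown in the listing; the function name host_input_sketch is hypothetical, and it assumes the same Args struct and parse_npy helper from example.h.

bool host_input_sketch(const lite::example::Args& args) {
    //! default config: the network runs on CPU and all IO tensors live in host memory
    lite::Config config;

    //! create and load the network; no NetworkIO is needed for the default path
    std::shared_ptr<lite::Network> network = std::make_shared<lite::Network>(config);
    network->load_model(args.model_path);

    //! copy the numpy input directly into the host input tensor
    std::shared_ptr<lite::Tensor> input_tensor = network->get_input_tensor(0);
    auto src_tensor = lite::example::parse_npy(args.input_path);
    input_tensor->copy_from(*src_tensor);

    //! run and wait; the output tensor is already readable from host memory
    network->forward();
    network->wait();

    std::shared_ptr<lite::Tensor> output_tensor = network->get_output_tensor(0);
    float* out_data = static_cast<float*>(output_tensor->get_memory_ptr());
    printf("first output element: %e\n", out_data[0]);
    return true;
}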

The MegEngine installation package already bundles the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build to choose between. To run GPU programs, make sure the machine has a GPU and that the driver is installed. If you would like to try deep-learning development on cloud GPU compute, you are welcome to visit the MegStudio platform.