
device_io.cpp

/**
 * \file example/cpp_example/device_io.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */
#include <thread>
#include "example.h"
#if LITE_BUILD_WITH_MGE
#include "misc.h"

using namespace lite;
using namespace example;

#if LITE_WITH_CUDA

namespace {
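//! device_input: stage the user data in a CUDA device Tensor, then hand the
//! device memory pointer to the network's input tensor with reset(), so the
//! network consumes input that already resides in device memory.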
bool device_input(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! config the network running in CUDA device
    lite::Config config{LiteDeviceType::LITE_CUDA};

    //! set NetworkIO
    NetworkIO network_io;
    std::string input_name = "data";
    bool is_host = false;
    IO device_input{input_name, is_host};
    network_io.inputs.push_back(device_input);

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config, network_io);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    Layout input_layout = input_tensor->get_layout();

    //! read data from numpy data file
    auto src_tensor = parse_npy(input_path);

    //! malloc the device memory
    auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);

    //! copy to the device memory
    tensor_device.copy_from(*src_tensor);

    //! now the device memory is filled with user input data, set it to the
    //! input tensor
    input_tensor->reset(tensor_device.get_memory_ptr(), input_layout);

    //! forward
    network->forward();
    network->wait();

    //! get the output data or read tensor set in network_in
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}
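//! device_input_output: both the input and the named output are declared as
//! device tensors (is_host = false) in NetworkIO, so the output stays in CUDA
//! memory after forward() and must be copied back to a host Tensor for reading.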
bool device_input_output(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! config the network running in CUDA device
    lite::Config config{LiteDeviceType::LITE_CUDA};

    //! set NetworkIO, including both input and output
    NetworkIO network_io;
    std::string input_name = "data";
    std::string output_name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]";
    bool is_host = false;
    IO device_input{input_name, is_host};
    IO device_output{output_name, is_host};
    network_io.inputs.push_back(device_input);
    network_io.outputs.push_back(device_output);

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config, network_io);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor_device = network->get_input_tensor(0);
    Layout input_layout = input_tensor_device->get_layout();

    //! read data from numpy data file
    auto src_tensor = parse_npy(input_path);

    //! malloc the device memory
    auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);

    //! copy to the device memory
    tensor_device.copy_from(*src_tensor);

    //! now the device memory is filled with user input data, set it to the
    //! input tensor
    input_tensor_device->reset(tensor_device.get_memory_ptr(), input_layout);

    //! forward
    network->forward();
    network->wait();

    //! the output is in device memory, so copy it to the host
    std::shared_ptr<Tensor> output_tensor_device = network->get_io_tensor(output_name);
    auto output_tensor = std::make_shared<Tensor>();
    output_tensor->copy_from(*output_tensor_device);

    //! get the output data or read tensor set in network_in
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}
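//! pinned_host_input: the input is staged in pinned (page-locked) host memory,
//! which typically allows faster host-to-device transfers than ordinary
//! pageable host memory; the network then reads the input from that buffer.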
bool pinned_host_input(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! config the network running in CUDA device
    lite::Config config{LiteDeviceType::LITE_CUDA};

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    Layout input_layout = input_tensor->get_layout();

    //! read data from numpy data file
    auto src_tensor = parse_npy(input_path);

    //! malloc the pinned host memory
    bool is_pinned_host = true;
    auto tensor_pinned_input =
            Tensor(LiteDeviceType::LITE_CUDA, input_layout, is_pinned_host);

    //! copy to the pinned memory
    tensor_pinned_input.copy_from(*src_tensor);

    //! set the pinned host memory to the network as input
    input_tensor->reset(tensor_pinned_input.get_memory_ptr(), input_layout);

    //! forward
    network->forward();
    network->wait();

    //! get the output data or read tensor set in network_in
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}

}  // namespace
REGIST_EXAMPLE("device_input", device_input);
REGIST_EXAMPLE("device_input_output", device_input_output);
REGIST_EXAMPLE("pinned_host_input", pinned_host_input);

#endif
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}