
device_io.cpp 6.2 kB

#include <cstdio>
#include <thread>

#include "example.h"

#if LITE_BUILD_WITH_MGE
using namespace lite;
using namespace example;

#if LITE_WITH_CUDA

namespace {
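//! Feed the network with input data that already resides in CUDA device
//! memory: the input IO is declared with is_host = false, the user copies the
//! data into a device Tensor, and the network input tensor is reset to that
//! device pointer. The result is read from the default output tensor.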
bool device_input(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! config the network running in CUDA device
    lite::Config config{LiteDeviceType::LITE_CUDA};

    //! set NetworkIO
    NetworkIO network_io;
    std::string input_name = "data";
    bool is_host = false;
    IO device_input{input_name, is_host};
    network_io.inputs.push_back(device_input);

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config, network_io);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    Layout input_layout = input_tensor->get_layout();

    //! read data from numpy data file
    auto src_tensor = parse_npy(input_path);

    //! malloc the device memory
    auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);

    //! copy to the device memory
    tensor_device.copy_from(*src_tensor);

    //! now the device memory is filled with the user input data, set it to the
    //! input tensor
    input_tensor->reset(tensor_device.get_memory_ptr(), input_layout);

    //! forward
    network->forward();
    network->wait();

    //! get the output data or read the tensor set in network_io
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}
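
//! Configure both the input and the output IO as device memory. The input is
//! filled the same way as in device_input; the output stays in CUDA device
//! memory after forward(), so it is copied back to a host Tensor before the
//! result is read.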
bool device_input_output(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! config the network running in CUDA device
    lite::Config config{LiteDeviceType::LITE_CUDA};

    //! set NetworkIO, including both input and output
    NetworkIO network_io;
    std::string input_name = "data";
    std::string output_name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]";
    bool is_host = false;
    IO device_input{input_name, is_host};
    IO device_output{output_name, is_host};
    network_io.inputs.push_back(device_input);
    network_io.outputs.push_back(device_output);

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config, network_io);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor_device = network->get_input_tensor(0);
    Layout input_layout = input_tensor_device->get_layout();

    //! read data from numpy data file
    auto src_tensor = parse_npy(input_path);

    //! malloc the device memory
    auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);

    //! copy to the device memory
    tensor_device.copy_from(*src_tensor);

    //! now the device memory is filled with the user input data, set it to the
    //! input tensor
    input_tensor_device->reset(tensor_device.get_memory_ptr(), input_layout);

    //! forward
    network->forward();
    network->wait();

    //! the output is in device memory, so copy it back to the host
    std::shared_ptr<Tensor> output_tensor_device = network->get_io_tensor(output_name);
    auto output_tensor = std::make_shared<Tensor>();
    output_tensor->copy_from(*output_tensor_device);

    //! get the output data or read the tensor set in network_io
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}
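
//! Allocate the input in CUDA pinned (page-locked) host memory and hand that
//! buffer to the network directly, so the transfer to the device can be done
//! more efficiently than from ordinary pageable host memory.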
bool pinned_host_input(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! config the network running in CUDA device
    lite::Config config{LiteDeviceType::LITE_CUDA};

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    Layout input_layout = input_tensor->get_layout();

    //! read data from numpy data file
    auto src_tensor = parse_npy(input_path);

    //! malloc the pinned host memory
    bool is_pinned_host = true;
    auto tensor_pinned_input =
            Tensor(LiteDeviceType::LITE_CUDA, input_layout, is_pinned_host);

    //! copy to the pinned memory
    tensor_pinned_input.copy_from(*src_tensor);

    //! set the pinned host memory to the network as input
    input_tensor->reset(tensor_pinned_input.get_memory_ptr(), input_layout);

    //! forward
    network->forward();
    network->wait();

    //! get the output data
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}
}  // namespace

REGIST_EXAMPLE("device_input", device_input);
REGIST_EXAMPLE("device_input_output", device_input_output);
REGIST_EXAMPLE("pinned_host_input", pinned_host_input);

#endif
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
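
The file depends on three things declared in example.h: the Args struct (with model_path and input_path), the parse_npy helper that loads a .npy file into a Tensor, and the REGIST_EXAMPLE macro that makes each function selectable by name. The sketch below shows one way such a registration mechanism could be implemented; it is a minimal illustration, not the actual contents of example.h, and the registry map, ExampleRegistrar, and get_example_registry names are assumptions introduced here.

// Hypothetical sketch of the example registration mechanism assumed by
// device_io.cpp above. The real example.h in the repository may differ.
#include <functional>
#include <map>
#include <string>

namespace lite {
namespace example {

// Command-line arguments handed to every example (only the fields used above).
struct Args {
    std::string model_path;
    std::string input_path;
};

// Signature shared by all example functions, e.g. device_input.
using ExampleFunc = std::function<bool(const Args&)>;

// Global name -> example-function registry.
inline std::map<std::string, ExampleFunc>& get_example_registry() {
    static std::map<std::string, ExampleFunc> registry;
    return registry;
}

// Helper whose constructor registers an example at static-initialization time.
struct ExampleRegistrar {
    ExampleRegistrar(const std::string& name, ExampleFunc func) {
        get_example_registry()[name] = std::move(func);
    }
};

}  // namespace example
}  // namespace lite

// REGIST_EXAMPLE("device_input", device_input); would then expand to a static
// registrar object, so a driver program can look the example up by name and
// call it with the parsed Args.
#define REGIST_EXAMPLE(name, func) \
    static lite::example::ExampleRegistrar example_registrar_##func(name, func)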