
device_io.cpp 6.2 kB

#include <thread>
#include "example.h"
#if LITE_BUILD_WITH_MGE
#include "misc.h"

using namespace lite;
using namespace example;

#if LITE_WITH_CUDA

namespace {
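//! Example: the input data already lives in CUDA device memory; the device
//! buffer is attached to the network's input tensor via Tensor::reset(), and
//! the result is read back from the default output tensor.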
bool device_input(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! configure the network to run on a CUDA device
    lite::Config config{LiteDeviceType::LITE_CUDA};

    //! set NetworkIO
    NetworkIO network_io;
    std::string input_name = "data";
    bool is_host = false;
    IO device_input{input_name, is_host};
    network_io.inputs.push_back(device_input);

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config, network_io);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    Layout input_layout = input_tensor->get_layout();

    //! read data from the numpy data file
    auto src_tensor = parse_npy(input_path);

    //! allocate the device memory
    auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);

    //! copy to the device memory
    tensor_device.copy_from(*src_tensor);

    //! now the device memory is filled with the user input data, set it as the
    //! input tensor
    input_tensor->reset(tensor_device.get_memory_ptr(), input_layout);

    //! forward
    network->forward();
    network->wait();

    //! get the output data or read the tensor set in network_io
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}

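//! Example: both the input and the named output are declared as device
//! tensors in NetworkIO, so the network consumes and produces data in CUDA
//! device memory; the output is copied back to the host before it is read.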
bool device_input_output(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! configure the network to run on a CUDA device
    lite::Config config{LiteDeviceType::LITE_CUDA};

    //! set NetworkIO, including both input and output
    NetworkIO network_io;
    std::string input_name = "data";
    std::string output_name = "TRUE_DIV(EXP[12065],reduce0[12067])[12077]";
    bool is_host = false;
    IO device_input{input_name, is_host};
    IO device_output{output_name, is_host};
    network_io.inputs.push_back(device_input);
    network_io.outputs.push_back(device_output);

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config, network_io);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor_device = network->get_input_tensor(0);
    Layout input_layout = input_tensor_device->get_layout();

    //! read data from the numpy data file
    auto src_tensor = parse_npy(input_path);

    //! allocate the device memory
    auto tensor_device = Tensor(LiteDeviceType::LITE_CUDA, input_layout);

    //! copy to the device memory
    tensor_device.copy_from(*src_tensor);

    //! now the device memory is filled with the user input data, set it as the
    //! input tensor
    input_tensor_device->reset(tensor_device.get_memory_ptr(), input_layout);

    //! forward
    network->forward();
    network->wait();

    //! the output is in device memory, so copy it to the host
    std::shared_ptr<Tensor> output_tensor_device = network->get_io_tensor(output_name);
    auto output_tensor = std::make_shared<Tensor>();
    output_tensor->copy_from(*output_tensor_device);

    //! get the output data or read the tensor set in network_io
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}

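//! Example: the input is staged in pinned (page-locked) host memory, which
//! the network can use directly as its input buffer and which typically makes
//! the host-to-device transfer faster.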
bool pinned_host_input(const Args& args) {
    std::string network_path = args.model_path;
    std::string input_path = args.input_path;

    //! configure the network to run on a CUDA device
    lite::Config config{LiteDeviceType::LITE_CUDA};

    //! create and load the network
    std::shared_ptr<Network> network = std::make_shared<Network>(config);
    network->load_model(network_path);

    std::shared_ptr<Tensor> input_tensor = network->get_input_tensor(0);
    Layout input_layout = input_tensor->get_layout();

    //! read data from the numpy data file
    auto src_tensor = parse_npy(input_path);

    //! allocate the pinned host memory
    bool is_pinned_host = true;
    auto tensor_pinned_input =
            Tensor(LiteDeviceType::LITE_CUDA, input_layout, is_pinned_host);

    //! copy to the pinned memory
    tensor_pinned_input.copy_from(*src_tensor);

    //! set the pinned host memory as the network input
    input_tensor->reset(tensor_pinned_input.get_memory_ptr(), input_layout);

    //! forward
    network->forward();
    network->wait();

    //! get the output data
    std::shared_ptr<Tensor> output_tensor = network->get_output_tensor(0);
    void* out_data = output_tensor->get_memory_ptr();
    size_t out_length = output_tensor->get_tensor_total_size_in_byte() /
                        output_tensor->get_layout().get_elem_size();
    float max = -1.0f;
    float sum = 0.0f;
    for (size_t i = 0; i < out_length; i++) {
        float data = static_cast<float*>(out_data)[i];
        sum += data;
        if (max < data)
            max = data;
    }
    printf("max=%e, sum=%e\n", max, sum);
    return true;
}

}  // namespace
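//! register the examples so the example runner can select them by name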
REGIST_EXAMPLE("device_input", device_input);
REGIST_EXAMPLE("device_input_output", device_input_output);
REGIST_EXAMPLE("pinned_host_input", pinned_host_input);

#endif
#endif

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}