
denseflownvidia.cpp 7.6 kB

/**
 * \file src/opr/impl/nvof/denseflownvidia.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "megbrain_build_config.h"

#if MGB_CUDA
#include <mutex>
#include <unordered_map>
#include <vector>
#include "megbrain/common.h"
#include "denseflownvidia.h"
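
// Layout of the shape vector, as read by the constructor and extract_flow():
//   shape = {batch, temporal, height, width, channel}, i.e. NTHWC frames.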
NVFlowExtractor::NVFlowExtractor(int device_id, std::vector<size_t>& shape,
                                 uint32_t preset, bool use_cuda_stream,
                                 bool debug) {
    batch_size = shape[0];
    m_width = shape[3];
    m_height = shape[2];
    debug_flag = debug;
    m_temporal_size = shape[1];
    m_use_cuda_stream = use_cuda_stream;
    out_width = (m_width + m_out_grid_size - 1) / m_out_grid_size;
    out_height = (m_height + m_out_grid_size - 1) / m_out_grid_size;
    m_width_in_blocks = (m_width + m_blockSizeX - 1) / m_blockSizeX;
    m_height_in_blocks = (m_height + m_blockSizeY - 1) / m_blockSizeY;
    out_size = out_width * out_height * 2;
    m_device_id = device_id;
    std::unordered_map<uint32_t, NV_OF_PERF_LEVEL> preset_map = {
            {0, NV_OF_PERF_LEVEL_SLOW},
            {1, NV_OF_PERF_LEVEL_MEDIUM},
            {2, NV_OF_PERF_LEVEL_FAST}};
    _preset = preset;
    auto search = preset_map.find(_preset);
    if (search == preset_map.end()) {
        mgb_throw(MegBrainError,
                  "NVOF: invalid preset level! err type: NV_OF_ERR_INVALID_PARAM");
    }
    perf_preset = search->second;
}
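
// Creates the NvOF engine plus per-batch buffer rings: buffer_pool_size
// input frames and (buffer_pool_size - 1) flow outputs per batch element,
// since T frames yield T - 1 consecutive-pair flow maps.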
void NVFlowExtractor::create_nvof_instances(int height, int width) {
    nv_optical_flow = NvOFCuda::Create(cu_context, width, height, buffer_format,
                                       input_buffer_type, output_buffer_type,
                                       NV_OF_MODE_OPTICALFLOW, perf_preset,
                                       input_stream, output_stream);
    nv_optical_flow->Init(m_out_grid_size);
    input_buffers = nv_optical_flow->CreateBuffers(
            NV_OF_BUFFER_USAGE_INPUT, buffer_pool_size * batch_size);
    output_buffers = nv_optical_flow->CreateBuffers(
            NV_OF_BUFFER_USAGE_OUTPUT, (buffer_pool_size - 1) * batch_size);
}
void NVFlowExtractor::init_nvof_engine() {
    std::lock_guard<std::mutex> lock(m_lock);
    if (init_flag == false) {
        set_device(m_device_id);
        if (cuCtxCreate(&cu_context, 0, cu_device)) {
            mgb_log_warn("nvof: create ctx failed, fallback to get current ctx");
            CUDA_DRVAPI_CALL(cuCtxGetCurrent(&cu_context));
        }
        if (m_use_cuda_stream) {
            CUDA_DRVAPI_CALL(cuStreamCreate(&input_stream, CU_STREAM_DEFAULT));
            CUDA_DRVAPI_CALL(cuStreamCreate(&output_stream, CU_STREAM_DEFAULT));
        }
        create_nvof_instances(m_height, m_width);
        init_flag = true;
    }
}
NVFlowExtractor::~NVFlowExtractor() {
    if (debug_flag) {
        mgb_log_debug("%s: %d start", __FUNCTION__, __LINE__);
    }
    if (m_use_cuda_stream) {
        cuStreamDestroy(output_stream);
        output_stream = nullptr;
        cuStreamDestroy(input_stream);
        input_stream = nullptr;
    }
    if (debug_flag) {
        mgb_log_debug("%s: %d end", __FUNCTION__, __LINE__);
    }
}
void NVFlowExtractor::set_device(int dev_id) {
    int nGpu = 0;
    if (debug_flag) {
        mgb_log_warn("config nvof gpu device id: %d", dev_id);
    }
    CUDA_DRVAPI_CALL(cuInit(0));
    CUDA_DRVAPI_CALL(cuDeviceGetCount(&nGpu));
    if (dev_id < 0 || dev_id >= nGpu) {
        mgb_log_warn("GPU ordinal out of range. Should be within [0, %d]",
                     nGpu - 1);
        mgb_throw(MegBrainError,
                  "NVOF: GPU Setting Error! err type: NV_OF_ERR_GENERIC");
    }
    CUDA_DRVAPI_CALL(cuDeviceGet(&cu_device, dev_id));
}
CUmemorytype NVFlowExtractor::get_mem_type(CUdeviceptr p) {
    unsigned int mem_type;
    auto ret =
            cuPointerGetAttribute(&mem_type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, p);
    if (CUDA_SUCCESS == ret) {
        mgb_assert(CU_MEMORYTYPE_DEVICE == mem_type ||
                           CU_MEMORYTYPE_HOST == mem_type,
                   "only CU_MEMORYTYPE_HOST and CU_MEMORYTYPE_DEVICE mem types "
                   "are implemented");
    } else {
        mgb_log_warn(
                "nvof: cuPointerGetAttribute failed; the nvof opr may have been "
                "initialized on a cpu comp_node, forcing mem type to "
                "CU_MEMORYTYPE_HOST");
        mem_type = CU_MEMORYTYPE_HOST;
    }
    return static_cast<CUmemorytype_enum>(mem_type);
}
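
// For each batch element, stream the frames through the input ring and run
// the hardware engine on every consecutive pair (t - 1, t); each resulting
// flow map is written to result_out_ptr at offset
// batch_idx * (temporal_size - 1) * out_size + (t - 1) * out_size.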
void NVFlowExtractor::extract_flow(unsigned char* frames,
                                   std::vector<size_t>& shape,
                                   int16_t* result_out_ptr) {
    auto batch_size = shape[0];
    auto temporal_size = shape[1];
    auto height = shape[2];
    auto width = shape[3];
    auto channel = shape[4];
    auto temporal_len = height * width * channel;
    auto batch_len = temporal_size * height * width * channel;
    init_nvof_engine();
    auto src_mem_type = get_mem_type(reinterpret_cast<CUdeviceptr>(frames));
    auto out_mem_type =
            get_mem_type(reinterpret_cast<CUdeviceptr>(result_out_ptr));
    if ((height != m_height || width != m_width) ||
        (m_temporal_size != temporal_size)) {
        mgb_log_warn("We do not support dynamic shapes at the mgb side");
        mgb_throw(MegBrainError,
                  "NVOF: shape mismatch! err type: NV_OF_ERR_GENERIC");
    }
    for (size_t batch_idx = 0; batch_idx < batch_size; batch_idx++) {
        auto input_buffer_batch_offset = buffer_pool_size * batch_idx;
        auto output_buffer_batch_offset = (buffer_pool_size - 1) * batch_idx;
        input_buffers[input_buffer_batch_offset]->UploadData(
                (unsigned char*)(frames + batch_idx * batch_len), src_mem_type);
        for (size_t temporal_idx = 1; temporal_idx < temporal_size;
             temporal_idx++) {
            input_buffers[input_buffer_batch_offset +
                          temporal_idx % buffer_pool_size]
                    ->UploadData(
                            (unsigned char*)(frames + batch_idx * batch_len +
                                             temporal_idx * temporal_len),
                            src_mem_type);
            nv_optical_flow->Execute(
                    input_buffers[input_buffer_batch_offset +
                                  (temporal_idx - 1) % buffer_pool_size]
                            .get(),
                    input_buffers[input_buffer_batch_offset +
                                  temporal_idx % buffer_pool_size]
                            .get(),
                    output_buffers[output_buffer_batch_offset +
                                   (temporal_idx - 1) % (buffer_pool_size - 1)]
                            .get(),
                    nullptr, nullptr);
            output_buffers[output_buffer_batch_offset +
                           (temporal_idx - 1) % (buffer_pool_size - 1)]
                    ->DownloadData(
                            result_out_ptr +
                                    batch_idx * (temporal_size - 1) * out_size +
                                    (temporal_idx - 1) * out_size,
                            out_mem_type);
        }
    }
    CUDA_DRVAPI_CALL(cuCtxSynchronize());
}
float NVFlowExtractor::get_precision() {
    return m_precision;
}

#endif
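
For orientation, below is a minimal, hypothetical driver for this class. The device id, preset, clip shape, channel count, and the 4-pixel output grid are illustrative assumptions; only the constructor and extract_flow signatures come from the file above.

// Hypothetical usage sketch -- not part of denseflownvidia.cpp.
#include <cstdint>
#include <vector>
#include "denseflownvidia.h"

void run_nvof_example(unsigned char* frames) {
    // frames is assumed to be an NTHWC uint8 buffer:
    // 1 clip of 2 frames, 720x1280, 4 channels (channel count depends on
    // the configured buffer_format; 4 is an assumption).
    std::vector<size_t> shape = {1, 2, 720, 1280, 4};

    // device 0, preset 1 (NV_OF_PERF_LEVEL_MEDIUM), dedicated CUDA streams,
    // no debug logging.
    NVFlowExtractor extractor(0, shape, 1, true, false);

    // T frames yield T - 1 flow maps; each map holds
    // out_width * out_height * 2 int16 components. Assuming a 4-pixel
    // output grid: out_width = ceil(1280 / 4), out_height = ceil(720 / 4).
    size_t out_size = ((1280 + 3) / 4) * ((720 + 3) / 4) * 2;
    std::vector<int16_t> flow(shape[0] * (shape[1] - 1) * out_size);

    extractor.extract_flow(frames, shape, flow.data());
}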

The MegEngine installation package ships with the CUDA environment needed to run code on a GPU, so there is no separate CPU or GPU build to choose from. To run GPU programs, make sure the machine has a GPU and that the driver is installed. If you would like to try deep-learning development on a cloud GPU platform, you are welcome to visit MegStudio.