
nms_opr.cpp 10 kB

#include "megbrain/opr/standalone/nms_opr.h"

#if MGB_CUDA
#include "./nms_kern.cuh"
#endif

#include "./nms_cpu.h"

#include "megbrain/comp_node_env.h"
#include "megbrain/serialization/sereg.h"
#include "megbrain/utils/arith_helper.h"  // for get_aligned_power2

#if MGB_ENABLE_FBS_SERIALIZATION
#include "megbrain/serialization/internal/mgb_cpp_opr_generated.h"
#include "megbrain/serialization/internal/schema_generated.h"
#endif

using namespace mgb::opr::standalone;

MGB_DYN_TYPE_OBJ_FINAL_IMPL(NMSKeep);

class NMSKeep::Kern {
public:
    virtual ~Kern() = default;

    //! get workspace size in bytes
    virtual size_t get_workspace_size(const NMSKeep* opr,
                                      const TensorShape& boxes) = 0;

    virtual void exec(const NMSKeep* opr, const DeviceTensorND& inp,
                      const DeviceTensorND& out_idx,
                      const DeviceTensorND& out_size,
                      const DeviceTensorND& workspace) = 0;
};

// f{{{ cuda kernel begins

#if MGB_CUDA

class NMSKeep::CUDAKern final : public Kern {
    size_t m_workspace_overlap_mask_bytes,
            m_workspace_overlap_mask_bytes_align, m_workspace_rm_mask_bytes;

    void init(const NMSKeep* opr, const TensorShape& boxes) {
        auto align = opr->comp_node().get_mem_addr_alignment();
        size_t nr_boxes = boxes[1];
        m_workspace_overlap_mask_bytes =
                nr_boxes * DIVUP(nr_boxes, 64) * sizeof(uint64_t);
        m_workspace_overlap_mask_bytes_align =
                get_aligned_power2(m_workspace_overlap_mask_bytes, align);
        m_workspace_rm_mask_bytes = DIVUP(nr_boxes, 64) * sizeof(uint64_t);
    }

public:
    size_t get_workspace_size(const NMSKeep* opr,
                              const TensorShape& boxes) override {
        init(opr, boxes);
        return m_workspace_overlap_mask_bytes_align + m_workspace_rm_mask_bytes;
    }

    void exec(const NMSKeep* opr, const DeviceTensorND& inp,
              const DeviceTensorND& out_idx, const DeviceTensorND& out_size,
              const DeviceTensorND& workspace) override;
};
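// Worked example (illustrative numbers, not from the original source): with
// nr_boxes = 1000 and DIVUP being ceiling division, DIVUP(1000, 64) = 16, so
// the pairwise overlap mask takes 1000 * 16 * sizeof(uint64_t) = 128000 bytes
// (one bit per box pair, packed into 64-bit words) and the removed-box mask
// takes 16 * sizeof(uint64_t) = 128 bytes; get_aligned_power2() rounds the
// first size up to the comp node's address alignment so that the second
// buffer starts on an aligned boundary within the single workspace.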
void NMSKeep::CUDAKern::exec(const NMSKeep* opr, const DeviceTensorND& inp,
                             const DeviceTensorND& out_idx,
                             const DeviceTensorND& out_size,
                             const DeviceTensorND& workspace) {
    // NOTE: the input comp node might be different from the output comp node
    // (for example, the CUDA stream may be modified to overlap computations);
    // a SingleCNOperatorNodeBase is expected to execute on a single comp node,
    // and the comp node is defined as the output comp node
    CompNode comp_node = out_idx.comp_node();
    // the comp node is also accessible from SingleCNOperatorNode
    mgb_assert(comp_node == opr->comp_node());

    // CompNodeEnv contains platform-specific properties of a CompNode
    auto&& cuda_env = CompNodeEnv::from_comp_node(comp_node).cuda_env();
    mgb_assert(cuda_env.device_prop.warpSize == 32, "invalid warp size: %d",
               cuda_env.device_prop.warpSize);
    auto stream = cuda_env.stream;

    init(opr, inp.shape());

    auto inp_ptr = inp.ptr<float>();
    void* workspace_ptr = workspace.raw_ptr();
    auto dev_overlap_mask = reinterpret_cast<uint64_t*>(workspace_ptr),
         dev_rm_mask = reinterpret_cast<uint64_t*>(
                 workspace.raw_ptr() + m_workspace_overlap_mask_bytes_align);
    auto out_idx_ptr = reinterpret_cast<uint32_t*>(out_idx.ptr<int32_t>()),
         out_size_ptr = reinterpret_cast<uint32_t*>(out_size.ptr<int32_t>());
    size_t batch = inp.shape(0), nr_boxes = inp.shape(1);

    MGB_CUDA_CHECK(cudaMemsetAsync(dev_overlap_mask, 0,
                                   m_workspace_overlap_mask_bytes, stream));

    auto max_output = opr->param().max_output;
    for (size_t i = 0; i < batch; ++i) {
        nms::launch_gen_mask(nr_boxes, opr->param().iou_thresh,
                             inp_ptr + i * nr_boxes * 4, DIVUP(nr_boxes, 64),
                             dev_overlap_mask, stream);
        MGB_CUDA_CHECK(cudaMemsetAsync(dev_rm_mask, 0,
                                       m_workspace_rm_mask_bytes, stream));
        nms::launch_gen_indices(nr_boxes, max_output, DIVUP(nr_boxes, 64),
                                dev_overlap_mask, dev_rm_mask,
                                out_idx_ptr + i * max_output, out_size_ptr + i,
                                stream);
    }
}

#endif  // MGB_CUDA for CUDAKern
// f}}} cuda kernel ends
// f{{{ cpu kernel begins

class NMSKeep::CPUKern final : public Kern {
public:
    ~CPUKern() = default;

    size_t get_workspace_size(const NMSKeep*,
                              const TensorShape& boxes) override {
        return nms::cpu_kern_workspace(boxes.shape[1]);
    }

    void exec(const NMSKeep* opr, const DeviceTensorND& inp,
              const DeviceTensorND& out_idx, const DeviceTensorND& out_size,
              const DeviceTensorND& workspace) override;
};

void NMSKeep::CPUKern::exec(const NMSKeep* opr, const DeviceTensorND& inp,
                            const DeviceTensorND& out_idx,
                            const DeviceTensorND& out_size,
                            const DeviceTensorND& workspace) {
    // See CUDAKern::exec for more explanation on output comp nodes.
    CompNode comp_node = out_idx.comp_node();

    auto inp_ptr = inp.ptr<float>();
    auto out_idx_ptr = reinterpret_cast<uint32_t*>(out_idx.ptr<int32_t>()),
         out_size_ptr = reinterpret_cast<uint32_t*>(out_size.ptr<int32_t>());
    size_t batch = inp.shape(0), nr_boxes = inp.shape(1);
    auto param = opr->param();
    auto workspace_ptr = workspace.raw_ptr();

    // NOTE: we must copy all the params into the kernel closure since it would
    // be dispatched on a different thread
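    // (illustrative aside, not in the original file: a by-reference capture
    // such as `auto bad_kern = [&]() { /* uses param */ };` would leave
    // dangling references, because exec() returns before the dispatch queue
    // runs the kernel, by which point these stack variables are destroyed)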
    auto kern = [=]() {
        for (size_t i = 0; i < batch; ++i) {
            nms::cpu_kern(nr_boxes, param.max_output, param.iou_thresh,
                          inp_ptr + i * nr_boxes * 4,
                          out_idx_ptr + i * param.max_output, out_size_ptr + i,
                          workspace_ptr);
        }
    };
    // The kernel must not be invoked directly; it is dispatched to the comp
    // node, which may execute it asynchronously on another thread.
    CompNodeEnv::from_comp_node(comp_node).cpu_env().dispatch(kern);
}

// f}}} cpu kernel ends
NMSKeep::NMSKeep(VarNode* boxes, const Param& param,
                 const OperatorNodeConfig& config)
        : Super(boxes->owner_graph(),  // owner graph
                config,                // OperatorNodeConfig
                "nms_keep",  // opr type name (used for generating opr name)
                {boxes}      // input vars for generating opr name
                ),
          m_param{param} {
    mgb_assert(boxes->dtype() == dtype::Float32(),
               "input should be float32; got %s", boxes->dtype().name());

    // setup m_kern according to device type
    switch (boxes->comp_node().device_type()) {
#if MGB_CUDA
        case CompNode::DeviceType::CUDA:
            m_kern = std::make_unique<CUDAKern>();
            break;
#endif
        case CompNode::DeviceType::CPU:
            m_kern = std::make_unique<CPUKern>();
            break;
        default:
            mgb_throw(MegBrainError, "NMSKeep: unsupported device type: %s",
                      boxes->comp_node().to_string().c_str());
    }

    add_input({boxes});
    add_output("indices")->dtype(dtype::Int32());
    add_output("sizes")->dtype(dtype::Int32());
    cg::add_workspace_output(this);  // workspace is also an output var

    // make the graph deduplication system consider m_param (so two oprs with
    // the same input vars but different param values would not be deduplicated)
    add_equivalence_component<PODHash<Param>>(&m_param);
}

// impl dtor after Kern is defined
NMSKeep::~NMSKeep() noexcept = default;
mgb::SymbolVar NMSKeep::make(SymbolVar boxes, const Param& param,
                             const OperatorNodeConfig& config) {
    // SymbolVar is just a wrapper of VarNode*, with overloaded methods such as
    // operator+()
    auto bvar = boxes.node();
    // insert the opr into the owner graph of boxes
    return boxes.insert_single_output_opr<NMSKeep>(bvar, param, config);
}
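// Usage sketch (hypothetical; `graph` and `host_boxes` are illustrative names,
// not part of this file): given a float32 var of shape (batch, nr_boxes, 4),
// construct the operator via make():
//
//     auto boxes = opr::Host2DeviceCopy::make(*graph, host_boxes);
//     NMSKeep::Param param;
//     param.iou_thresh = 0.5f;  // overlap threshold for suppression
//     param.max_output = 100;   // max number of boxes kept per batch
//     auto indices = NMSKeep::make(boxes, param);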
void NMSKeep::get_output_var_shape(const TensorShapeArray& inp_shape,
                                   TensorShapeArray& out_shape) const {
    auto boxes = inp_shape.at(0);
    mgb_assert(boxes.ndim == 3 && boxes.shape[2] == 4, "invalid box shape: %s",
               boxes.to_string().c_str());

    // out_shape should match the outputs added in the constructor
    mgb_assert(out_shape.size() == 3);
    auto batch = boxes[0];
    out_shape[0] = {batch, m_param.max_output};                // indices
    out_shape[1] = {batch};                                    // sizes
    out_shape[2] = {m_kern->get_workspace_size(this, boxes)};  // workspace
}
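// Shape example (illustrative numbers): for input boxes of shape (2, 1000, 4)
// with max_output = 100, the outputs are indices of shape (2, 100), sizes of
// shape (2,), and a 1-D workspace whose length is whatever the active Kern
// reports via get_workspace_size().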
void NMSKeep::add_input_layout_constraint() {
    input(0)->add_layout_constraint_contiguous();
}

void NMSKeep::scn_do_execute() {
    DeviceTensorND empty_workspace;
    m_kern->exec(this, input(0)->dev_tensor(), output(0)->dev_tensor(),
                 output(1)->dev_tensor(),
                 // if the workspace size is 0, output(2) would be invalid and
                 // its dev_tensor() can not be accessed
                 output(2)->dev_tensor_valid() ? output(2)->dev_tensor()
                                               : empty_workspace);
}
#if MGB_ENABLE_FBS_SERIALIZATION
namespace mgb {
namespace serialization {
namespace fbs {
template <>
struct ParamConverter<opr::standalone::NMSKeep::Param> {
    using FlatBufferType = param::NMSKeep;
    static opr::standalone::NMSKeep::Param to_param(const FlatBufferType* fb) {
        return {fb->iou_thresh(), fb->max_output()};
    }
    static flatbuffers::Offset<FlatBufferType> to_flatbuffer(
            flatbuffers::FlatBufferBuilder& builder,
            const opr::standalone::NMSKeep::Param& p) {
        return param::CreateNMSKeep(builder, p.iou_thresh, p.max_output);
    }
};
}  // namespace fbs
}  // namespace serialization
}  // namespace mgb
#endif

namespace mgb {
void _hack_pull_in_nms_opr_object() {}
}  // namespace mgb
// register serialization: the default implementation uses Opr::Param; it
// requires Param::TAG, Opr::param() and Opr::make(..., param) to exist.
// Note: the second argument (1) means this operator has one input.
using NMSKeepMGB = NMSKeep;
MGB_SEREG_OPR(NMSKeepMGB, 1);

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
