/**
 * \file src/opr/impl/nvof/denseflownvidia.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */

#include "megbrain_build_config.h"

#if MGB_CUDA
#include <mutex>
#include <unordered_map>
#include <vector>
#include "megbrain/common.h"
#include "denseflownvidia.h"

NVFlowExtractor::NVFlowExtractor(int device_id, std::vector<size_t>& shape,
                                 uint32_t preset, bool use_cuda_stream,
                                 bool debug) {
    // shape layout: {batch, temporal, height, width, channel}
    batch_size = shape[0];
    m_width = shape[3];
    m_height = shape[2];
    debug_flag = debug;
    m_temporal_size = shape[1];
    m_use_cuda_stream = use_cuda_stream;
    // the output flow map is sub-sampled by m_out_grid_size in each dimension
    out_width = (m_width + m_out_grid_size - 1) / m_out_grid_size;
    out_height = (m_height + m_out_grid_size - 1) / m_out_grid_size;
    m_width_in_blocks = (m_width + m_blockSizeX - 1) / m_blockSizeX;
    m_height_in_blocks = (m_height + m_blockSizeY - 1) / m_blockSizeY;
    // each output grid cell holds a two-component (x, y) flow vector
    out_size = out_width * out_height * 2;
    m_device_id = device_id;

    std::unordered_map<uint32_t, NV_OF_PERF_LEVEL> preset_map = {
            {0, NV_OF_PERF_LEVEL_SLOW},
            {1, NV_OF_PERF_LEVEL_MEDIUM},
            {2, NV_OF_PERF_LEVEL_FAST}};

    _preset = preset;
    auto search = preset_map.find(_preset);
    if (search == preset_map.end()) {
        mgb_throw(MegBrainError,
                  "NVOF: invalid preset level! err type: "
                  "NV_OF_ERR_INVALID_PARAM");
    }
    perf_preset = search->second;
}

void NVFlowExtractor::create_nvof_instances(int height, int width) {
    nv_optical_flow = NvOFCuda::Create(cu_context, width, height, buffer_format,
                                       input_buffer_type, output_buffer_type,
                                       NV_OF_MODE_OPTICALFLOW, perf_preset,
                                       input_stream, output_stream);
    nv_optical_flow->Init(m_out_grid_size);
    // one ring of input buffers per batch sample; one fewer output buffer per
    // sample, since flow is computed between consecutive frame pairs
    input_buffers = nv_optical_flow->CreateBuffers(
            NV_OF_BUFFER_USAGE_INPUT, buffer_pool_size * batch_size);
    output_buffers = nv_optical_flow->CreateBuffers(
            NV_OF_BUFFER_USAGE_OUTPUT, (buffer_pool_size - 1) * batch_size);
}

void NVFlowExtractor::init_nvof_engine() {
    // lazy, thread-safe one-time initialization
    std::lock_guard<std::mutex> lock(m_lock);

    if (init_flag == false) {
        set_device(m_device_id);
        if (cuCtxCreate(&cu_context, 0, cu_device)) {
            mgb_log_warn("nvof: create ctx failed, fallback to get current ctx");
            CUDA_DRVAPI_CALL(cuCtxGetCurrent(&cu_context));
        }

        if (m_use_cuda_stream) {
            CUDA_DRVAPI_CALL(cuStreamCreate(&input_stream, CU_STREAM_DEFAULT));
            CUDA_DRVAPI_CALL(cuStreamCreate(&output_stream, CU_STREAM_DEFAULT));
        }
        create_nvof_instances(m_height, m_width);
        init_flag = true;
    }
}

NVFlowExtractor::~NVFlowExtractor() {
    if (debug_flag) {
        mgb_log_debug("%s: %d start", __FUNCTION__, __LINE__);
    }

    if (m_use_cuda_stream) {
        cuStreamDestroy(output_stream);
        output_stream = nullptr;
        cuStreamDestroy(input_stream);
        input_stream = nullptr;
    }

    if (debug_flag) {
        mgb_log_debug("%s: %d end", __FUNCTION__, __LINE__);
    }
}

void NVFlowExtractor::set_device(int dev_id) {
    int nGpu = 0;
    if (debug_flag) {
        mgb_log_warn("config nvof gpu device id: %d", dev_id);
    }

    CUDA_DRVAPI_CALL(cuInit(0));
    CUDA_DRVAPI_CALL(cuDeviceGetCount(&nGpu));
    if (dev_id < 0 || dev_id >= nGpu) {
        mgb_log_warn("GPU ordinal out of range. Should be within [0, %d]",
                     nGpu - 1);
        mgb_throw(MegBrainError,
                  "NVOF: GPU Setting Error! "
                  "err type: NV_OF_ERR_GENERIC");
    }
    CUDA_DRVAPI_CALL(cuDeviceGet(&cu_device, dev_id));
}

CUmemorytype NVFlowExtractor::get_mem_type(CUdeviceptr p) {
    unsigned int mem_type;
    auto ret = cuPointerGetAttribute(&mem_type,
                                     CU_POINTER_ATTRIBUTE_MEMORY_TYPE, p);
    if (CUDA_SUCCESS == ret) {
        mgb_assert(CU_MEMORYTYPE_DEVICE == mem_type ||
                           CU_MEMORYTYPE_HOST == mem_type,
                   "only CU_MEMORYTYPE_HOST and CU_MEMORYTYPE_DEVICE mem types "
                   "are supported");
    } else {
        mgb_log_warn(
                "nvof call to cuPointerGetAttribute failed; the nvof opr may "
                "have been initialized on a cpu comp_node, force mem type to "
                "CU_MEMORYTYPE_HOST");
        mem_type = CU_MEMORYTYPE_HOST;
    }
    return static_cast<CUmemorytype>(mem_type);
}

void NVFlowExtractor::extract_flow(unsigned char* frames,
                                   std::vector<size_t>& shape,
                                   int16_t* result_out_ptr) {
    auto batch_size = shape[0];
    auto temporal_size = shape[1];
    auto height = shape[2];
    auto width = shape[3];
    auto channel = shape[4];
    auto temporal_len = height * width * channel;
    auto batch_len = temporal_size * height * width * channel;

    init_nvof_engine();

    auto src_mem_type = get_mem_type(reinterpret_cast<CUdeviceptr>(frames));
    auto out_mem_type =
            get_mem_type(reinterpret_cast<CUdeviceptr>(result_out_ptr));

    if ((height != m_height || width != m_width) ||
        (m_temporal_size != temporal_size)) {
        mgb_log_warn("We do not support dynamic shape at mgb side");
        mgb_throw(MegBrainError,
                  "NVOF: invalid input shape! err type: NV_OF_ERR_GENERIC");
    }

    for (size_t batch_idx = 0; batch_idx < batch_size; batch_idx++) {
        auto input_buffer_batch_offset = buffer_pool_size * batch_idx;
        auto output_buffer_batch_offset = (buffer_pool_size - 1) * batch_idx;
        // upload the first frame of this batch sample
        input_buffers[input_buffer_batch_offset]->UploadData(
                (unsigned char*)(frames + batch_idx * batch_len), src_mem_type);
        for (size_t temporal_idx = 1; temporal_idx < temporal_size;
             temporal_idx++) {
            // upload the current frame into the ring of input buffers
            input_buffers[input_buffer_batch_offset +
                          temporal_idx % buffer_pool_size]
                    ->UploadData(
                            (unsigned char*)(frames + batch_idx * batch_len +
                                             temporal_idx * temporal_len),
                            src_mem_type);
            // compute flow between the previous frame and the current frame
            nv_optical_flow->Execute(
                    input_buffers[input_buffer_batch_offset +
                                  (temporal_idx - 1) % buffer_pool_size]
                            .get(),
                    input_buffers[input_buffer_batch_offset +
                                  temporal_idx % buffer_pool_size]
                            .get(),
                    output_buffers[output_buffer_batch_offset +
                                   (temporal_idx - 1) % (buffer_pool_size - 1)]
                            .get(),
                    nullptr, nullptr);
            // download the flow map of this frame pair into the output tensor
            output_buffers[output_buffer_batch_offset +
                           (temporal_idx - 1) % (buffer_pool_size - 1)]
                    ->DownloadData(
                            result_out_ptr +
                                    batch_idx * (temporal_size - 1) * out_size +
                                    (temporal_idx - 1) * out_size,
                            out_mem_type);
        }
    }
    CUDA_DRVAPI_CALL(cuCtxSynchronize());
}

float NVFlowExtractor::get_precision() {
    return m_precision;
}

#endif