/**
 * \file src/opr/impl/nvof/denseflownvidia.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */

#include "megbrain_build_config.h"

#if MGB_CUDA
#include <mutex>
#include <unordered_map>
#include <vector>
#include "megbrain/common.h"
#include "denseflownvidia.h"

NVFlowExtractor::NVFlowExtractor(int device_id, std::vector<size_t>& shape,
                                 uint32_t preset, bool use_cuda_stream,
                                 bool debug) {
    // shape layout: {batch, temporal, height, width, channel}
    batch_size = shape[0];
    m_width = shape[3];
    m_height = shape[2];
    debug_flag = debug;
    m_temporal_size = shape[1];
    m_use_cuda_stream = use_cuda_stream;
    // the output flow map is sub-sampled by m_out_grid_size in each dimension
    out_width = (m_width + m_out_grid_size - 1) / m_out_grid_size;
    out_height = (m_height + m_out_grid_size - 1) / m_out_grid_size;
    m_width_in_blocks = (m_width + m_blockSizeX - 1) / m_blockSizeX;
    m_height_in_blocks = (m_height + m_blockSizeY - 1) / m_blockSizeY;
    // each output grid cell holds a two-component (x, y) flow vector
    out_size = out_width * out_height * 2;
    m_device_id = device_id;

    std::unordered_map<uint32_t, NV_OF_PERF_LEVEL> preset_map = {
            {0, NV_OF_PERF_LEVEL_SLOW},
            {1, NV_OF_PERF_LEVEL_MEDIUM},
            {2, NV_OF_PERF_LEVEL_FAST}};

    _preset = preset;
    auto search = preset_map.find(_preset);
    if (search == preset_map.end()) {
        mgb_throw(MegBrainError,
                  "NVOF: invalid preset level! err type: "
                  "NV_OF_ERR_INVALID_PARAM");
    }
    perf_preset = search->second;
}

void NVFlowExtractor::create_nvof_instances(int height, int width) {
    nv_optical_flow = NvOFCuda::Create(cu_context, width, height, buffer_format,
                                       input_buffer_type, output_buffer_type,
                                       NV_OF_MODE_OPTICALFLOW, perf_preset,
                                       input_stream, output_stream);
    nv_optical_flow->Init(m_out_grid_size);
    // one ring of input buffers per batch sample; one fewer output buffer per
    // sample, since flow is computed between consecutive frame pairs
    input_buffers = nv_optical_flow->CreateBuffers(
            NV_OF_BUFFER_USAGE_INPUT, buffer_pool_size * batch_size);
    output_buffers = nv_optical_flow->CreateBuffers(
            NV_OF_BUFFER_USAGE_OUTPUT, (buffer_pool_size - 1) * batch_size);
}

void NVFlowExtractor::init_nvof_engine() {
    // lazy, thread-safe one-time initialization
    std::lock_guard<std::mutex> lock(m_lock);

    if (init_flag == false) {
        set_device(m_device_id);
        if (cuCtxCreate(&cu_context, 0, cu_device)) {
            mgb_log_warn("nvof: create ctx failed, fallback to get current ctx");
            CUDA_DRVAPI_CALL(cuCtxGetCurrent(&cu_context));
        }

        if (m_use_cuda_stream) {
            CUDA_DRVAPI_CALL(cuStreamCreate(&input_stream, CU_STREAM_DEFAULT));
            CUDA_DRVAPI_CALL(cuStreamCreate(&output_stream, CU_STREAM_DEFAULT));
        }
        create_nvof_instances(m_height, m_width);
        init_flag = true;
    }
}

NVFlowExtractor::~NVFlowExtractor() {
    if (debug_flag) {
        mgb_log_debug("%s: %d start", __FUNCTION__, __LINE__);
    }

    if (m_use_cuda_stream) {
        cuStreamDestroy(output_stream);
        output_stream = nullptr;
        cuStreamDestroy(input_stream);
        input_stream = nullptr;
    }

    if (debug_flag) {
        mgb_log_debug("%s: %d end", __FUNCTION__, __LINE__);
    }
}

void NVFlowExtractor::set_device(int dev_id) {
    int nGpu = 0;
    if (debug_flag) {
        mgb_log_warn("config nvof gpu device id: %d", dev_id);
    }

    CUDA_DRVAPI_CALL(cuInit(0));
    CUDA_DRVAPI_CALL(cuDeviceGetCount(&nGpu));
    if (dev_id < 0 || dev_id >= nGpu) {
        mgb_log_warn("GPU ordinal out of range. Should be within [0, %d]",
                     nGpu - 1);
        mgb_throw(MegBrainError,
                  "NVOF: GPU Setting Error! "
                  "err type: NV_OF_ERR_GENERIC");
    }
    CUDA_DRVAPI_CALL(cuDeviceGet(&cu_device, dev_id));
}

CUmemorytype NVFlowExtractor::get_mem_type(CUdeviceptr p) {
    unsigned int mem_type;
    auto ret = cuPointerGetAttribute(&mem_type,
                                     CU_POINTER_ATTRIBUTE_MEMORY_TYPE, p);
    if (CUDA_SUCCESS == ret) {
        mgb_assert(CU_MEMORYTYPE_DEVICE == mem_type ||
                           CU_MEMORYTYPE_HOST == mem_type,
                   "only CU_MEMORYTYPE_HOST and CU_MEMORYTYPE_DEVICE mem types "
                   "are supported");
    } else {
        mgb_log_warn(
                "nvof call to cuPointerGetAttribute failed; the nvof opr may "
                "have been initialized on a cpu comp_node, force mem type to "
                "CU_MEMORYTYPE_HOST");
        mem_type = CU_MEMORYTYPE_HOST;
    }
    return static_cast<CUmemorytype>(mem_type);
}

void NVFlowExtractor::extract_flow(unsigned char* frames,
                                   std::vector<size_t>& shape,
                                   int16_t* result_out_ptr) {
    auto batch_size = shape[0];
    auto temporal_size = shape[1];
    auto height = shape[2];
    auto width = shape[3];
    auto channel = shape[4];
    auto temporal_len = height * width * channel;
    auto batch_len = temporal_size * height * width * channel;

    init_nvof_engine();

    auto src_mem_type = get_mem_type(reinterpret_cast<CUdeviceptr>(frames));
    auto out_mem_type =
            get_mem_type(reinterpret_cast<CUdeviceptr>(result_out_ptr));

    if ((height != m_height || width != m_width) ||
        (m_temporal_size != temporal_size)) {
        mgb_log_warn("We do not support dynamic shape at mgb side");
        mgb_throw(MegBrainError,
                  "NVOF: invalid input shape! err type: NV_OF_ERR_GENERIC");
    }

    for (size_t batch_idx = 0; batch_idx < batch_size; batch_idx++) {
        auto input_buffer_batch_offset = buffer_pool_size * batch_idx;
        auto output_buffer_batch_offset = (buffer_pool_size - 1) * batch_idx;
        // upload the first frame of this batch sample
        input_buffers[input_buffer_batch_offset]->UploadData(
                (unsigned char*)(frames + batch_idx * batch_len), src_mem_type);
        for (size_t temporal_idx = 1; temporal_idx < temporal_size;
             temporal_idx++) {
            // upload the current frame into the ring of input buffers
            input_buffers[input_buffer_batch_offset +
                          temporal_idx % buffer_pool_size]
                    ->UploadData(
                            (unsigned char*)(frames + batch_idx * batch_len +
                                             temporal_idx * temporal_len),
                            src_mem_type);
            // compute flow between the previous frame and the current frame
            nv_optical_flow->Execute(
                    input_buffers[input_buffer_batch_offset +
                                  (temporal_idx - 1) % buffer_pool_size]
                            .get(),
                    input_buffers[input_buffer_batch_offset +
                                  temporal_idx % buffer_pool_size]
                            .get(),
                    output_buffers[output_buffer_batch_offset +
                                   (temporal_idx - 1) % (buffer_pool_size - 1)]
                            .get(),
                    nullptr, nullptr);
            // download the flow map of this frame pair into the output tensor
            output_buffers[output_buffer_batch_offset +
                           (temporal_idx - 1) % (buffer_pool_size - 1)]
                    ->DownloadData(
                            result_out_ptr +
                                    batch_idx * (temporal_size - 1) * out_size +
                                    (temporal_idx - 1) * out_size,
                            out_mem_type);
        }
    }
    CUDA_DRVAPI_CALL(cuCtxSynchronize());
}

float NVFlowExtractor::get_precision() {
    return m_precision;
}

#endif