/**
 * \file dnn/src/cuda/convolution3d/opr_impl.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 */

#include "./opr_impl.h"
#include "./backward_data/algo.h"
#include "./backward_filter/algo.h"
#include "./forward/algo.h"
#include "./helper.h"

#include "src/common/algo_chooser.h"
#include "src/cuda/utils.h"

using namespace megdnn;
using namespace cuda;
using namespace convolution3d;

#define TO_STRING2(v) #v
#define TO_STRING(v) TO_STRING2(v)
#define CUDNN_VERSION_STR  \
    TO_STRING(CUDNN_MAJOR) \
    "." TO_STRING(CUDNN_MINOR) "." TO_STRING(CUDNN_PATCHLEVEL)

/* ============== Convolution3DForwardImpl ============== */
Convolution3DForwardImpl::Algorithm*
Convolution3DForwardImpl::get_algorithm_heuristic(
        const TensorLayout& src, const TensorLayout& filter,
        const TensorLayout& dst, size_t workspace_limit_in_bytes,
        const AlgoAttribute& positive_attr,
        const AlgoAttribute& negative_attr) {
    auto fm = check_layout_fwd(src, filter, dst);
    return get_algorithm_heuristic(src, fm, dst, workspace_limit_in_bytes,
                                   positive_attr, negative_attr);
}
Convolution3DForwardImpl::Algorithm*
Convolution3DForwardImpl::get_algorithm_heuristic(
        const TensorLayout& src, const CanonizedFilterMeta& filter,
        const TensorLayout& dst, size_t workspace_limit_in_bytes,
        const AlgoAttribute& positive_attr,
        const AlgoAttribute& negative_attr) {
    AlgoBase::SizeArgs args(this, src, filter, dst);

#if CUDNN_MAJOR < 7 || (CUDNN_MAJOR == 7 && CUDNN_MINOR < 5)
    if (args.filter_meta.group > 1) {
        // prefer special chanwise impl since as the group conv of cudnn whose
        // version is lower than v7.5.0 is still slower than our implementation
        // in many channel-wise cases
        if (sm_algo_pack.chanwise.is_available_attribute(
                    args, positive_attr, negative_attr,
                    workspace_limit_in_bytes)) {
            return &sm_algo_pack.chanwise;
        }
    }
#endif

    auto prefer_1x1x1 = [&args, positive_attr, negative_attr,
                         workspace_limit_in_bytes]() {
        const size_t MAX_BATCH_SIZE_FOR_1x1x1_MAT_ALGO = 4;
        size_t batch_size = args.src_layout->shape[0];
        if (batch_size > MAX_BATCH_SIZE_FOR_1x1x1_MAT_ALGO) {
            return false;
        }
        return sm_algo_pack.a1x1x1.is_available_attribute(
                args, positive_attr, negative_attr, workspace_limit_in_bytes);
    };

    auto get_cudnn_algo =
            [this, &args, workspace_limit_in_bytes, positive_attr,
             negative_attr]() -> Convolution3DForwardImpl::AlgoBase* {
        auto cudnn_handle = cuda::cudnn_handle(this->handle());
        cudnnConvolutionFwdAlgo_t algo;
        CUDNNForwardDescs desc;
        args.init_desc(desc);

        bool got = cudnn_get_convolution_fwd_algo_helper(
                cudnn_handle, desc.src_desc.desc, desc.filter_desc.desc,
                desc.conv_desc.desc, desc.dst_desc.desc,
                workspace_limit_in_bytes, &algo, positive_attr, negative_attr);
        if (got) {
            return static_cast<AlgoBase*>(
                    megdnn::get_algo_match_attribute<Convolution3DForwardImpl>(
                            sm_algo_pack.cudnn_from_enum(algo), positive_attr,
                            negative_attr));
        } else {
            return nullptr;
        }
    };
    if (prefer_1x1x1()) {
        return &sm_algo_pack.a1x1x1;
    }
    if (is_cudnn_supported(args)) {
        if (auto algo = get_cudnn_algo())
            return algo;
    }
    if (args.filter_meta.group > 1) {
        auto orig_args = args;
        TensorLayout a, b;
        AlgoGroupConvGeneral::modify_size_args(args, a, b);
        if (prefer_1x1x1()) {
            return sm_algo_pack.algo2gconv.at(&sm_algo_pack.a1x1x1);
        }
        if (is_cudnn_supported(args)) {
            if (auto algo = get_cudnn_algo())
                return sm_algo_pack.algo2gconv.at(algo);
        }
        args = orig_args;
    }

    return megdnn::get_algo_match_attribute<Convolution3DForwardImpl>(
            sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes,
            "cuda conv3d fwd", positive_attr, negative_attr);
}

std::vector<Convolution3DForwardImpl::Algorithm*>
Convolution3DForwardImpl::get_all_algorithms(const TensorLayout& src,
                                             const TensorLayout& filter,
                                             const TensorLayout& dst) {
    return megdnn::get_all_algorithms<Convolution3DForwardImpl>(
            {this, src, filter, dst});
}

size_t Convolution3DForwardImpl::get_workspace_in_bytes(
        const TensorLayout& src, const TensorLayout& filter,
        const TensorLayout& dst) {
    AlgoBase::SizeArgs args(this, src, filter, dst);
    return get_algorithm(this, src, args.filter_meta, dst)
            ->get_workspace_in_bytes(args);
}

void Convolution3DForwardImpl::exec(_megdnn_tensor_in src,
                                    _megdnn_tensor_in filter,
                                    _megdnn_tensor_out dst,
                                    _megdnn_workspace workspace) {
    AlgoBase::ExecArgs args(this, src, filter, dst, workspace);
    auto algo = get_algorithm(this, src.layout, args.filter_meta, dst.layout);
    algo->check_workspace(args, workspace).exec(args);
}

const char* Convolution3DForwardImpl::get_algorithm_set_name() const {
    return "CUDACONV0+CUDNN" CUDNN_VERSION_STR;
}

void Convolution3DBackwardDataImpl::exec(_megdnn_tensor_in filter,
                                         _megdnn_tensor_in diff,
                                         _megdnn_tensor_out grad,
                                         _megdnn_workspace workspace) {
    AlgoBase::ExecArgs args(this, filter, diff, grad, workspace);
    auto algo = get_algorithm(this, args.filter_meta, diff.layout, grad.layout);
    algo->check_workspace(args, workspace).exec(args);
}

std::vector<Convolution3DBackwardDataImpl::Algorithm*>
Convolution3DBackwardDataImpl::get_all_algorithms(const TensorLayout& filter,
                                                  const TensorLayout& diff,
                                                  const TensorLayout& grad) {
    return megdnn::get_all_algorithms<Convolution3DBackwardDataImpl>(
            {this, filter, diff, grad});
}

Convolution3DBackwardDataImpl::Algorithm*
Convolution3DBackwardDataImpl::get_algorithm_heuristic(
        const TensorLayout& filter, const TensorLayout& diff,
        const TensorLayout& grad, size_t workspace_limit_in_bytes,
        const AlgoAttribute& positive_attr,
        const AlgoAttribute& negative_attr) {
    auto fm = check_layout_fwd(grad, filter, diff);
    return get_algorithm_heuristic(fm, diff, grad, workspace_limit_in_bytes,
                                   positive_attr, negative_attr);
}

Convolution3DBackwardDataImpl::Algorithm*
Convolution3DBackwardDataImpl::get_algorithm_heuristic(
        const CanonizedFilterMeta& filter, const TensorLayout& diff,
        const TensorLayout& grad, size_t workspace_limit_in_bytes,
        const AlgoAttribute& positive_attr,
        const AlgoAttribute& negative_attr) {
    AlgoBase::SizeArgs args(this, filter, diff, grad);

    if (args.filter_meta.group > 1 &&
        sm_algo_pack.chanwise.is_available_attribute(
                args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
        return &sm_algo_pack.chanwise;
    }

    auto get_cudnn_algo =
            [this, &args, workspace_limit_in_bytes, positive_attr,
             negative_attr]() -> Convolution3DBackwardDataImpl::AlgoBase* {
        auto cudnn_handle = cuda::cudnn_handle(this->handle());
        cudnnConvolutionBwdDataAlgo_t algo;
        CUDNNBwdDataDescs desc;
        args.init_desc(desc);
        bool got = cudnn_get_convolution_bwd_data_algo_helper(
                cudnn_handle, desc.filter_desc.desc, desc.diff_desc.desc,
                desc.conv_desc.desc, desc.grad_desc.desc,
                workspace_limit_in_bytes, &algo, positive_attr, negative_attr);
        if (got) {
            return static_cast<AlgoBase*>(megdnn::get_algo_match_attribute<
                                          Convolution3DBackwardDataImpl>(
                    sm_algo_pack.cudnn_from_enum(algo), positive_attr,
                    negative_attr));
        } else {
            return nullptr;
        }
    };

    if (is_cudnn_supported(args.as_fwd_args())) {
        if (auto algo = get_cudnn_algo())
            return algo;
    }

    if (args.filter_meta.group > 1) {
        auto orig_args = args;
        TensorLayout a, b;
        AlgoGroupConvGeneral::modify_size_args(args, a, b);
        if (is_cudnn_supported(args.as_fwd_args())) {
            if (auto algo = get_cudnn_algo())
                return sm_algo_pack.algo2gconv.at(algo);
        }
        args = orig_args;
    }

    return megdnn::get_algo_match_attribute<Convolution3DBackwardDataImpl>(
            sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes,
            "cuda conv3d bwd data", positive_attr, negative_attr);
}

size_t Convolution3DBackwardDataImpl::get_workspace_in_bytes(
        const TensorLayout& filter, const TensorLayout& diff,
        const TensorLayout& grad) {
    AlgoBase::SizeArgs args(this, filter, diff, grad);
    return get_algorithm(this, args.filter_meta, diff, grad)
            ->get_workspace_in_bytes(args);
}

const char* Convolution3DBackwardDataImpl::get_algorithm_set_name() const {
    return "CUDACONV0+CUDNN" CUDNN_VERSION_STR;
}

void Convolution3DBackwardFilterImpl::exec(_megdnn_tensor_in src,
                                           _megdnn_tensor_in diff,
                                           _megdnn_tensor_out grad,
                                           _megdnn_workspace workspace) {
    AlgoBase::ExecArgs args(this, src, diff, grad, workspace);
    auto algo =
            get_algorithm(this, src.layout, diff.layout, args.grad_filter_meta);
    algo->check_workspace(args, workspace).exec(args);
}

std::vector<Convolution3DBackwardFilterImpl::Algorithm*>
Convolution3DBackwardFilterImpl::get_all_algorithms(const TensorLayout& src,
                                                    const TensorLayout& diff,
                                                    const TensorLayout& grad) {
    return megdnn::get_all_algorithms<Convolution3DBackwardFilterImpl>(
            {this, src, diff, grad});
}

Convolution3DBackwardFilterImpl::Algorithm*
Convolution3DBackwardFilterImpl::get_algorithm_heuristic(
        const TensorLayout& src, const TensorLayout& diff,
        const TensorLayout& grad, size_t workspace_limit_in_bytes,
        const AlgoAttribute& positive_attr,
        const AlgoAttribute& negative_attr) {
    auto fm = check_layout_fwd(src, grad, diff);
    return get_algorithm_heuristic(src, diff, fm, workspace_limit_in_bytes,
                                   positive_attr, negative_attr);
}

Convolution3DBackwardFilterImpl::Algorithm*
Convolution3DBackwardFilterImpl::get_algorithm_heuristic(
        const TensorLayout& src, const TensorLayout& diff,
        const CanonizedFilterMeta& grad, size_t workspace_limit_in_bytes,
        const AlgoAttribute& positive_attr,
        const AlgoAttribute& negative_attr) {
    AlgoBase::SizeArgs args(this, src, diff, grad);

    if (args.grad_filter_meta.group > 1 &&
        sm_algo_pack.chanwise.is_available_attribute(
                args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
        return &sm_algo_pack.chanwise;
    }

    auto get_cudnn_algo =
            [this, &args, workspace_limit_in_bytes, positive_attr,
             negative_attr]() -> Convolution3DBackwardFilterImpl::AlgoBase* {
        auto cudnn_handle = cuda::cudnn_handle(this->handle());
        cudnnConvolutionBwdFilterAlgo_t algo;
        CUDNNBwdFilterDescs desc;
        args.init_desc(desc);
        bool got = cudnn_get_convolution_bwd_filter_algo_helper(
                cudnn_handle, desc.src_desc.desc, desc.diff_desc.desc,
                desc.conv_desc.desc, desc.grad_desc.desc,
                workspace_limit_in_bytes, &algo, positive_attr, negative_attr);
        if (got) {
            return static_cast<AlgoBase*>(megdnn::get_algo_match_attribute<
                                          Convolution3DBackwardFilterImpl>(
                    sm_algo_pack.cudnn_from_enum(algo), positive_attr,
                    negative_attr));
        } else {
            return nullptr;
        }
    };

    if (is_cudnn_supported(args.as_fwd_args())) {
        if (auto algo = get_cudnn_algo())
            return algo;
    }
    if (args.grad_filter_meta.group > 1) {
        auto orig_args = args;
        TensorLayout a, b;
        AlgoGroupConvGeneral::modify_size_args(args, a, b);
        if (is_cudnn_supported(args.as_fwd_args())) {
            if (auto algo = get_cudnn_algo())
                return sm_algo_pack.algo2gconv.at(algo);
        }
        args = orig_args;
    }

    return megdnn::get_algo_match_attribute<Convolution3DBackwardFilterImpl>(
            sm_algo_pack.non_cudnn_algos, args, workspace_limit_in_bytes,
            "cuda conv3d bwd filter", positive_attr, negative_attr);
}

size_t Convolution3DBackwardFilterImpl::get_workspace_in_bytes(
        const TensorLayout& src, const TensorLayout& diff,
        const TensorLayout& grad) {
    AlgoBase::SizeArgs args(this, src, diff, grad);
    return get_algorithm(this, src, diff, args.grad_filter_meta)
            ->get_workspace_in_bytes(args);
}

const char* Convolution3DBackwardFilterImpl::get_algorithm_set_name() const {
    return "CUDACONV0+CUDNN" CUDNN_VERSION_STR;
}

// vim: syntax=cpp.doxygen