feat(cuda): add large kernel direct conv to heuristic algo chooser

GitOrigin-RevId: bc927b6df7
3 years ago · f7994683bd
--- a/dnn/src/cuda/conv_bias/opr_impl.cpp
+++ b/dnn/src/cuda/conv_bias/opr_impl.cpp
@@ -148,7 +148,9 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic(
     //! choose for large kernel cases
     size_t fh = args.filter_meta.spatial[0], fw = args.filter_meta.spatial[1];
     size_t hi = src[2], wi = src[3];
-    const bool prefer_dnn_lk_implbmm = hi <= 2 * fh && wi <= 2 * fw;
+    const bool prefer_dnn_lk_implbmm =
+            hi <= 2 * fh && wi <= 2 * fw && wi < 32 && hi <= 32;
+    const bool prefer_direct_lk = fh > 10 && fw > 10;
     //! avoid bad case in cudnn, check dnn chanwise impl first
     if (is_chanwise) {
         if (prefer_dnn_lk_implbmm) {
@@ -160,6 +162,11 @@ ConvBiasForward::Algorithm* ConvBiasForwardImpl::get_algorithm_heuristic(
             if (sm_algo_pack.f32_implicit_bmm[0].is_available_attribute(
                         args, positive_attr, negative_attr, workspace_limit_in_bytes))
                 return &sm_algo_pack.f32_implicit_bmm[0];
+        } else if (
+                prefer_direct_lk &&
+                sm_algo_pack.depthwise_large_filter.is_available_attribute(
+                        args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
+            return &sm_algo_pack.depthwise_large_filter;
         } else if (prefer_dnn_chanwise) {
             if (sm_algo_pack.chanwise.is_available_attribute(
                         args, positive_attr, negative_attr, workspace_limit_in_bytes))
--- a/dnn/src/cuda/convolution/opr_impl.cpp
+++ b/dnn/src/cuda/convolution/opr_impl.cpp
@@ -119,7 +119,10 @@ ConvolutionBackwardDataImpl::Algorithm* ConvolutionBackwardDataImpl::
     size_t fh = args.filter_meta.spatial[0], fw = args.filter_meta.spatial[1];
     size_t ho = diff[2], wo = diff[3];
     const bool prefer_dnn_lk_implbmm = args.filter_meta.format == Param::Format::NCHW &&
-                                       ho <= 2 * fh && wo <= 2 * fw;
+                                       ho <= 2 * fh && wo <= 2 * fw && ho < 32 &&
+                                       wo < 32;
+    const bool prefer_direct_lk =
+            args.filter_meta.format == Param::Format::NCHW && fh > 10 && fw > 10;
     if (prefer_dnn_lk_implbmm) {
 #if CUDA_VERSION >= 10020
         if (sm_algo_pack.implbmm_nchw_hmma[0].is_available_attribute(
@@ -131,6 +134,12 @@ ConvolutionBackwardDataImpl::Algorithm* ConvolutionBackwardDataImpl::
             return &sm_algo_pack.implbmm_nchw_fma[0];
     }
 
+    if (prefer_direct_lk &&
+        sm_algo_pack.depthwise_large_filter.is_available_attribute(
+                args, positive_attr, negative_attr, workspace_limit_in_bytes)) {
+        return &sm_algo_pack.depthwise_large_filter;
+    }
+
     if (args.filter_meta.group > 1 &&
         sm_algo_pack.chanwise.is_available_attribute(
                 args, positive_attr, negative_attr, workspace_limit_in_bytes)) {