feat(x86/rvv): add AGENT_NCHW_NCHW44 algo

GitOrigin-RevId: 8cf6c3fac0
2 years ago · 54b5db1729
--- a/dnn/src/fallback/conv_bias/gi/fp32/algos.h
+++ b/dnn/src/fallback/conv_bias/gi/fp32/algos.h
@@ -1,5 +1,6 @@
 #pragma once

 #include "src/common/opr_delegate.h"
 #include "src/fallback/conv_bias/opr_impl.h"
 #include "src/fallback/matrix_mul/opr_impl.h"

@@ -249,6 +250,26 @@ public:
    MEGDNN_DECL_ALGO_TYPE(GI_COMMON_DIRECT_NCHW_NCHW44_FP32)
 };

 //! fp32 conv-bias algorithm selected for NCHW44-format filters (see usable()).
 //! The implementation relayouts filter/bias into workspace buffers, forwards
 //! the computation to the kernels of a heuristically chosen NCHW ConvBias
 //! algorithm and finally relayouts the result into the NCHW44 dst tensor.
 class ConvBiasImpl::AlgoF32DirectNCHWNCHW44AGENT final : public AlgoBase {
 public:
    AlgoF32DirectNCHWNCHW44AGENT() {}

    //! this algorithm reports itself as REPRODUCIBLE
    AlgoAttribute attribute() const override { return AlgoAttribute::REPRODUCIBLE; }

    //! name used to look the algorithm up (e.g. from the test suite)
    const char* name() const override { return "F32_CONV_AGENT_NCHW_NCHW44"; }

    //! whether this algorithm can handle the problem described by \p param
    bool usable(
            const NCBKernSizeParam& param,
            AlgoSelectionStrategy algo_selection_strategy) const override;

    //! total workspace in bytes: relayouted filter + relayouted bias +
    //! workspace of the delegated conv + intermediate dst buffer
    size_t get_workspace(const NCBKernSizeParam& param) const override;

    //! kernels in execution order: filter/bias relayout, delegated conv
    //! kernels, dst relayout
    SmallVector<NCBKern> dispatch_kerns(const NCBKernSizeParam& param) const override;

    ConvAlgoTypePack get_algo_type() const override {
        return {AlgoDataType::FLOAT32, AlgoCategory::DIRECT};
    }
    MEGDNN_DECL_ALGO_TYPE(GI_COMMON_DIRECT_NCHW_NCHW44_AGENT_FP32)
 };

 class ConvBiasImpl::AlgoF32ChannelWiseNCHW44 final : public AlgoBase {
    SmallVector<NCBKern> get_kimpls(const NCBKernSizeParam& param) const;

--- a/dnn/src/fallback/conv_bias/gi/fp32/f32_direct_nchw_nchw44_agent_algo.cpp
+++ b/dnn/src/fallback/conv_bias/gi/fp32/f32_direct_nchw_nchw44_agent_algo.cpp
@@ -0,0 +1,256 @@
 #include "megdnn/opr_param_defs.h"
 #include "megdnn/oprs.h"
 #include "src/common/nchw_nchwxx_valid.h"
 #include "src/common/opr_delegate.h"
 #include "src/fallback/conv_bias/gi/fp32/algos.h"
 #include "src/fallback/elemwise_helper/elemwise_op.h"

 #include "midout.h"

 using namespace megdnn;
 using namespace fallback;

 MIDOUT_DECL(megdnn_fallback_conv_bias_fp32_nchw_nchw44_agent)

 namespace {

 //! Build the ConvBias param for the delegated NCHW convolution: padding,
 //! stride, dilation and nonlinearity are taken from \p p, the format is forced
 //! to NCHW, sparse to DENSE and the compute mode to DEFAULT.
 param::ConvBias get_param_convbias(const ConvBiasImpl::NCBKernSizeParam& p) {
    const auto& meta = p.filter_meta;
    //! a flipped filter means true convolution, otherwise cross correlation
    const param::ConvBias::Mode conv_mode =
            meta.should_flip ? param::ConvBias::Mode::CONVOLUTION
                             : param::ConvBias::Mode::CROSS_CORRELATION;

    return param::ConvBias{
            p.nonlineMode,
            conv_mode,
            param::ConvBias::Sparse::DENSE,
            ConvBias::Param::Format::NCHW,
            meta.padding[0],
            meta.padding[1],
            meta.stride[0],
            meta.stride[1],
            meta.dilation[0],
            meta.dilation[1],
            megdnn::param::ConvBias::ComputeMode::DEFAULT};
 }

 //! Collect every layout used by the agent algorithm.
 //! Index map of the returned array:
 //!   0 src (NCHW)           1 filter (NCHW)           2 filter (NCHW44)
 //!   3 bias (NCHW)          4 bias (NCHW44)           5 dst (NCHW)
 //!   6 dst (NCHW44)         7 filter (5-dim reshape)  8 bias (5-dim reshape)
 //!   9 dst (5-dim reshape)
 //! The bias layouts stay empty unless the bias mode is BROADCAST_CHANNEL_BIAS.
 TensorLayoutArray get_layouts(const ConvBiasImpl::NCBKernSizeParam& p) {
    UNPACK_CONV_NCB_KERN_SIZES(p);
    //! names introduced by the unpack macro that this helper does not need
    MEGDNN_MARK_USED_VAR(SH);
    MEGDNN_MARK_USED_VAR(SW);
    MEGDNN_MARK_USED_VAR(PH);
    MEGDNN_MARK_USED_VAR(PW);
    MEGDNN_MARK_USED_VAR(OW);
    MEGDNN_MARK_USED_VAR(OH);

    //! number of 4-channel output blocks
    const auto oc_blocks = OC / 4;

    TensorLayout src_nchw({N, IC, IH, IW}, p.src_type);

    //! filter: packed 44 form, plain form and the 5-dim view between them
    TensorLayout filter_nchw({OC, IC, FH, FW}, p.filter_type);
    TensorLayout filter_nchw44({oc_blocks, FH, FW, IC, 4}, p.filter_type);
    TensorLayout filter_5d({oc_blocks, 4, IC, FH, FW}, p.filter_type);

    //! bias: empty unless a per-channel broadcast bias is present
    TensorLayout bias_nchw{{}, p.bias_type};
    TensorLayout bias_nchw44{{}, p.bias_type};
    TensorLayout bias_5d{{}, p.bias_type};
    if (p.bias_mode == BiasMode::BROADCAST_CHANNEL_BIAS) {
        bias_nchw = TensorLayout({1, OC, 1, 1}, p.bias_type);
        bias_nchw44 = TensorLayout({1, oc_blocks, 1, 1, 4}, p.bias_type);
        bias_5d = TensorLayout({1, oc_blocks, 4, 1, 1}, p.bias_type);
    }

    //! dst: plain form, packed 44 form and the 5-dim view between them
    TensorLayout dst_nchw = TensorLayout({N, OC, OH, OW}, p.dst_type);
    TensorLayout dst_nchw44 = TensorLayout({N, oc_blocks, OH, OW, 4}, p.dst_type);
    TensorLayout dst_5d = TensorLayout({N, oc_blocks, 4, OH, OW}, p.dst_type);

    return {src_nchw,   filter_nchw, filter_nchw44, bias_nchw, bias_nchw44,
            dst_nchw,   dst_nchw44,  filter_5d,     bias_5d,   dst_5d};
 }

 //! Describe the workspace of the agent algorithm as four consecutive chunks:
 //!   0 relayouted filter, 1 relayouted bias,
 //!   2 workspace requested by the delegated conv, 3 intermediate dst.
 //! \p conv_bias_op has its param overwritten with the delegated conv param
 //! before it is queried for its workspace size. The bundle is returned
 //! without a base pointer; callers attach one with set().
 static WorkspaceBundle get_bundle(
        const ConvBiasImpl::NCBKernSizeParam& param,
        const std::unique_ptr<ConvBias>& conv_bias_op) {
    auto all_layouts = get_layouts(param);
    auto src = all_layouts[0];
    auto filter = all_layouts[1];
    auto bias = all_layouts[3];
    auto dst = all_layouts[5];

    //! the op must carry the delegated param before asking for its workspace
    conv_bias_op->param() = get_param_convbias(param);
    auto empty_z = TensorLayout();
    auto conv_bytes = conv_bias_op->get_workspace_in_bytes(
            src, filter, bias, empty_z, dst, nullptr);

    size_t filter_bytes = filter.span().dist_byte();
    size_t bias_bytes = bias.span().dist_byte();
    auto dst_bytes = dst.span().dist_byte();

    return {nullptr, {filter_bytes, bias_bytes, conv_bytes, dst_bytes}};
 }
 };  // namespace

 namespace {
 inline bool is_usable(
        const DTypeEnum src_dtype, const DTypeEnum filter_dtype,
        const DTypeEnum dst_dtype,
        const ConvolutionBase<param::Convolution>::CanonizedFilterMeta& fm,
        const BiasMode bias_mode, const param::ConvBias::NonlineMode nonline_mode) {
    bool ok_type =
            ((src_dtype == DTypeEnum::Float32 && filter_dtype == DTypeEnum::Float32 &&
              (dst_dtype == DTypeEnum::Float32))) &&
            (fm.format == param::Convolution::Format::NCHW44);
    bool ok_nonline = nonline_mode == param::ConvBias::NonlineMode::IDENTITY ||
                      nonline_mode == param::ConvBias::NonlineMode::RELU ||
                      nonline_mode == param::ConvBias::NonlineMode::SIGMOID ||
                      nonline_mode == param::ConvBias::NonlineMode::H_SWISH;
    bool ok_src_dst =
            fm.icpg < 4 && (fm.ocpg % 4 == 0 && fm.ocpg >= 4) && fm.group == 1;

    bool ok_filter = fm.spatial_ndim == 2 && fm.spatial[0] == fm.spatial[1] &&
                     (fm.spatial[0] == 2 || fm.spatial[0] == 3 || fm.spatial[0] == 5 ||
                      fm.spatial[0] == 7);
    bool ok_slide = fm.dilation[0] == 1 && fm.dilation[1] == 1 &&
                    fm.stride[0] == fm.stride[1] &&
                    (fm.stride[0] == 1 || fm.stride[1] == 2);
    bool ok_conv = !fm.should_flip && bias_mode != BiasMode::BIAS;
    bool avaible =
            ok_type && ok_nonline && ok_src_dst && ok_filter && ok_slide && ok_conv;
    return avaible;
 }
 };  // namespace

 //! Forward the relevant fields of \p param to is_usable(); the selection
 //! strategy is ignored.
 bool ConvBiasImpl::AlgoF32DirectNCHWNCHW44AGENT::usable(
        const NCBKernSizeParam& param, AlgoSelectionStrategy) const {
    const auto src_enum = param.src_type.enumv();
    const auto filter_enum = param.filter_type.enumv();
    const auto dst_enum = param.dst_type.enumv();
    return is_usable(
            src_enum, filter_enum, dst_enum, param.filter_meta, param.bias_mode,
            param.nonlineMode);
 }

 //! Total workspace in bytes, i.e. the sum of all chunks described by
 //! get_bundle(). Returns 0 when the midout region body is not executed.
 size_t ConvBiasImpl::AlgoF32DirectNCHWNCHW44AGENT::get_workspace(
        const NCBKernSizeParam& param) const {
    MIDOUT_BEGIN(
            megdnn_fallback_conv_bias_fp32_nchw_nchw44_agent,
            midout_iv("AlgoF32DirectNCHWNCHW44AGENT::get_workspace"_hash)) {
        //! a temporary operator is only needed to query the delegated workspace
        auto delegate_op = param.handle->create_operator<ConvBias>();
        auto ws_bundle = get_bundle(param, delegate_op);
        return ws_bundle.total_size_in_bytes();
    }
    MIDOUT_END();
    return 0;
 }

 //! Build the kernel list of the agent algorithm. In order:
 //!   1. one kernel that relayouts the NCHW44 filter (and the bias, if any)
 //!      into workspace chunks 0 and 1,
 //!   2. every kernel of the heuristically selected NCHW ConvBias algorithm,
 //!      wrapped so that it reads filter/bias from and writes dst to the
 //!      workspace,
 //!   3. one kernel that relayouts the intermediate dst (chunk 3) into the
 //!      NCHW44 dst tensor.
 //! Workspace chunk indices follow get_bundle(); layout indices follow
 //! get_layouts(). An empty list is returned when the midout region body is
 //! not executed.
 SmallVector<ConvBiasImpl::NCBKern> ConvBiasImpl::AlgoF32DirectNCHWNCHW44AGENT::
        dispatch_kerns(const NCBKernSizeParam& k_param) const {
    SmallVector<ConvBiasImpl::NCBKern> ret_kerns;

    MIDOUT_BEGIN(
            megdnn_fallback_conv_bias_fp32_nchw_nchw44_agent,
            midout_iv("AlgoF32DirectNCHWNCHW44AGENT::dispatch_kerns"_hash)) {
        auto filter_and_bias_dimshuffle = [](const NCBKernParam& kern_param,
                                             const NCBKernIndex&) {
            auto layouts = get_layouts(kern_param);
            auto filter_layout_44 = layouts[2];
            auto bias_layout44 = layouts[4];
            auto filter_layout_reshape = layouts[7];
            auto bias_layout_reshape = layouts[8];

            //! the operator is only needed to recompute the workspace bundle
            auto conv_bias_op = kern_param.handle->create_operator<ConvBias>();
            auto bundle = get_bundle(kern_param, conv_bias_op);
            bundle.set(kern_param.workspace_ptr);
            auto weight_ws = bundle.get(0);
            auto bias_ws = bundle.get(1);

            //! relayout bias and weight
            TensorND chw_weight_t = TensorND(weight_ws, filter_layout_reshape);
            TensorND weight44_t = TensorND(
                    kern_param.filter_ptr.get_ptr(),
                    filter_layout_44.dimshuffle({0, 4, 3, 1, 2}));
            auto relayout_op = inplace_cpu_handle()->create_operator<Relayout>();
            relayout_op->exec(weight44_t, chw_weight_t);

            TensorND chw_bias_t = TensorND(bias_ws, bias_layout_reshape);
            //! an empty bias layout means there is no bias to relayout
            if (bias_layout44.ndim != 0) {
                TensorND bias44_t = TensorND(
                        kern_param.bias_ptr.get_ptr(),
                        bias_layout44.dimshuffle({0, 1, 4, 2, 3}));
                relayout_op->exec(bias44_t, chw_bias_t);
            }
        };
        ret_kerns.push_back({filter_and_bias_dimshuffle, {1}});

        auto do_agent_conv = [&ret_kerns, &k_param]() {
            auto layouts = get_layouts(k_param);
            auto src_layout = layouts[0];
            auto filter_layout = layouts[1];
            auto bias_layout = layouts[3];
            auto dst_layout = layouts[5];

            //! do chw conv
            auto conv_bias_op = k_param.handle->create_operator<ConvBias>();
            conv_bias_op->param() = get_param_convbias(k_param);
            auto dummy_z = TensorND();
            auto&& conv_bias_algo =
                    static_cast<ConvBiasImpl*>(conv_bias_op.get())
                            ->get_algorithm_heuristic(
                                    src_layout, filter_layout, bias_layout,
                                    dummy_z.layout, dst_layout,
                                    std::numeric_limits<size_t>::max(),
                                    AlgoAttribute::DEFAULT, AlgoAttribute::DEFAULT);
            //! the delegated algorithm must see an NCHW filter format
            auto new_param = k_param;
            new_param.filter_meta.format = ConvBias::Param::Format::NCHW;
            auto&& conv_bias_kerns =
                    static_cast<AlgoBase*>(conv_bias_algo)->dispatch_kerns(new_param);
            for (size_t i = 0; i < conv_bias_kerns.size(); i++) {
                auto&& kernel = conv_bias_kerns[i];
                //! wrap the delegated kernel: redirect filter, bias, dst and
                //! workspace to the chunks of the agent workspace
                auto run = [kernel](
                                   const NCBKernParam& p,
                                   const NCBKernIndex& ncb_index) {
                    auto conv_bias_op = p.handle->create_operator<ConvBias>();
                    auto bundle = get_bundle(p, conv_bias_op);
                    bundle.set(p.workspace_ptr);
                    auto weight_ws = bundle.get(0);
                    auto bias_ws = bundle.get(1);
                    auto chw_conv_ws = bundle.get(2);
                    auto chw_conv_ws_size = bundle.get_size(2);
                    auto chw_conv_dst_ws = bundle.get(3);

                    auto param = p;
                    param.filter_ptr = weight_ws;
                    param.bias_ptr = bias_ws;
                    param.dst_ptr = chw_conv_dst_ws;
                    param.workspace_ptr = chw_conv_ws;
                    param.workspace_size = chw_conv_ws_size;
                    kernel.kern(param, {ncb_index.thread_id, ncb_index.ndrange_id});
                };
                ret_kerns.push_back({run, kernel.global_size});
            }
        };
        do_agent_conv();

        auto dest_dimshuffle = [](const NCBKernParam& kern_param, const NCBKernIndex&) {
            auto layouts = get_layouts(kern_param);
            auto dst_layout44 = layouts[6];
            auto dst_layout_reshape = layouts[9];

            auto conv_bias_op = kern_param.handle->create_operator<ConvBias>();
            auto bundle = get_bundle(kern_param, conv_bias_op);
            bundle.set(kern_param.workspace_ptr);
            auto chw_conv_dst_ws = bundle.get(3);

            //! relayout dst to dst44 tensor
            TensorND chw44_dst_t = TensorND(kern_param.dst_ptr.get_ptr(), dst_layout44);
            auto relayout_op = inplace_cpu_handle()->create_operator<Relayout>();
            relayout_op->exec(
                    {chw_conv_dst_ws, dst_layout_reshape.dimshuffle({0, 1, 3, 4, 2})},
                    chw44_dst_t);
        };
        ret_kerns.push_back({dest_dimshuffle, {1}});
        return ret_kerns;
    }
    MIDOUT_END();
    //! keep every control path returning a value, as get_workspace() does
    return {};
 }

 // vim: syntax=cpp.doxygen
--- a/dnn/src/fallback/conv_bias/gi/fp32/f32_direct_nchw_nchw44_algo.cpp
+++ b/dnn/src/fallback/conv_bias/gi/fp32/f32_direct_nchw_nchw44_algo.cpp
@@ -1,16 +1,3 @@
 /**
 * \file
 dnn/src/fallback/conv_bias/gi/fp32/f32_direct_nchw_nchw44_algo.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */

 #include "megdnn/oprs.h"
 #include "src/common/nchw_nchwxx_valid.h"
 #include "src/common/opr_delegate.h"
--- a/dnn/src/fallback/conv_bias/opr_impl.cpp
+++ b/dnn/src/fallback/conv_bias/opr_impl.cpp
@@ -84,7 +84,8 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj {
    AlgoBase::Mapper m_all_algos_map;
    SmallVector<fallback::ConvBiasImpl::AlgoBase*> m_gi_winograd_algos;

    AlgoF32DirectNCHWNCHW44 f32_direct_stride2_nchw_nchw44;
    AlgoF32DirectNCHWNCHW44 f32_nchw_nchw44;
    AlgoF32DirectNCHWNCHW44AGENT f32_nchw_nchw44_agent;
    AlgoF32ChannelWiseNCHW44 f32_chanel_wise_nchw44;
    AlgoF32DirectNCHW44 f32_direct_nchw44;

@@ -94,8 +95,17 @@ class ConvBiasImpl::AlgoPack : NonCopyableObj {

 public:
    AlgoPack() {
        // fallback gi fp32 algo
        m_all_algos.emplace_back(&f32_direct_stride2_nchw_nchw44);
        //! fallback gi fp32 algo
        //! now f32_nchw_nchw44_agent is faster than f32_nchw_nchw44
        //! on x86 and rvv platform, so we adjust heuristic order.
 #if MEGDNN_AARCH64 || MEGDNN_ARMV7
        m_all_algos.emplace_back(&f32_nchw_nchw44);
        m_all_algos.emplace_back(&f32_nchw_nchw44_agent);
 #else
        m_all_algos.emplace_back(&f32_nchw_nchw44_agent);
        m_all_algos.emplace_back(&f32_nchw_nchw44);
 #endif

        m_all_algos.emplace_back(&f32_chanel_wise_nchw44);
        m_all_algos.emplace_back(&f32_direct_nchw44);
        m_all_algos.emplace_back(&f32_direct_stride1);
@@ -471,7 +481,8 @@ ConvBiasImpl::NCBKernSizeParam ConvBiasImpl::make_ncb_kern_size_param(
             param().compute_mode,
             nr_threads,
             reinterpret_cast<const ConvolutionForward::PreprocessedFilter*>(
                     preprocessed_filter)},
                     preprocessed_filter),
             handle()},
            bias.dtype,
            bias.stride[0],
            bias_mode,
@@ -491,6 +502,7 @@ ConvBiasImpl::NCBKernParam ConvBiasImpl::make_ncb_kern_param(
    ret.dst_ptr = dst.get_ref_ptr();
    ret.workspace_ptr = workspace.raw_ptr;
    ret.workspace_size = workspace.size;
    ret.handle = handle();
    return ret;
 }

--- a/dnn/src/fallback/conv_bias/opr_impl.h
+++ b/dnn/src/fallback/conv_bias/opr_impl.h
@@ -228,6 +228,7 @@ public:
            GI_COMMON_DIRECT_STRD2_FP32,
            GI_COMMON_DIRECT_NCHW44_FP32,
            GI_COMMON_DIRECT_NCHW_NCHW44_FP32,
            GI_COMMON_DIRECT_NCHW_NCHW44_AGENT_FP32,
            GI_COMMON_CHWNWISE_NCHW44_F32,

 #if MEGDNN_X86
@@ -389,6 +390,7 @@ private:
    class AlgoF32DirectStride1;
    class AlgoF32DirectStride2;
    class AlgoF32DirectNCHWNCHW44;
    class AlgoF32DirectNCHWNCHW44AGENT;
    class AlgoF32ChannelWiseNCHW44;
    class AlgoF32DirectNCHW44;

--- a/dnn/src/fallback/convolution/opr_impl.cpp
+++ b/dnn/src/fallback/convolution/opr_impl.cpp
@@ -242,7 +242,8 @@ ConvolutionImpl::NCBKernSizeParam ConvolutionImpl::make_ncb_kern_size_param(
            {dst.stride[0], dst.stride[1], dst.stride[2], dst.stride[3]},
            param().compute_mode,
            nr_threads,
            preprocessed_filter};
            preprocessed_filter,
            handle()};
 }

 ConvolutionImpl::NCBKernParam ConvolutionImpl::make_ncb_kern_param(
--- a/dnn/src/fallback/convolution/opr_impl.h
+++ b/dnn/src/fallback/convolution/opr_impl.h
@@ -101,6 +101,7 @@ public:
        const PreprocessedFilter* preprocessed_filter;
        //! get the data type category of the param for select the algo
        AlgoDataType deduce_algo_data_type() const;
        Handle* handle;
    };

    //! memory param for kernels with non-contiguous batch
--- a/dnn/test/fallback/conv_bias.cpp
+++ b/dnn/test/fallback/conv_bias.cpp
@@ -354,6 +354,31 @@ TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_NCHW_NCHW44_F32_S1) {
            handle(), "F32_CONV_NCHW_NCHW44");
 }

 //! Generates one multi-threaded fallback test per bias mode, named
 //! CONVBIAS_GI_NCHW_NCHW44_F32_S2_AGENT_<_SUFFIX>, that runs check_conv_bias
 //! against the algorithm named "F32_CONV_AGENT_NCHW_NCHW44".
 //! NOTE(review): {2, 3, 5, 7} and 2 are presumably the kernel sizes and the
 //! stride -- confirm against get_nchw44_conv_bias_args.
 #define CB(_MODE, _SUFFIX)                                                            \
    TEST_F(FALLBACK_MULTI_THREADS, CONVBIAS_GI_NCHW_NCHW44_F32_S2_AGENT_##_SUFFIX) {  \
        check_conv_bias(                                                              \
                conv_bias::get_nchw44_conv_bias_args(                                 \
                        {2, 3, 5, 7}, ONLY_IDENTITY_NLMODE, {_MODE}, 2, false, true), \
                handle(), "F32_CONV_AGENT_NCHW_NCHW44");                              \
    }
 //! instantiate for the bias modes covered here: none and per-channel broadcast
 CB(megdnn::BiasMode::NO_BIAS, NO_BIAS);
 CB(megdnn::BiasMode::BROADCAST_CHANNEL_BIAS, BROADCAST_CHANNEL_BIAS);
 #undef CB

 //! Generates one multi-threaded fallback test per nonlinearity that runs
 //! check_conv_bias against the algorithm named "F32_CONV_AGENT_NCHW_NCHW44",
 //! using ONLY_BR_BIASMODE for the bias modes.
 //! The generated names keep the hard-coded "IDENTITY_" part of the prefix, so
 //! they read CONVBIAS_GI_NCHW_NCHW44_F32_S1_AGENT_IDENTITY_<_SUFFIX>
 //! (e.g. ..._IDENTITY_RELU).
 //! NOTE(review): {2, 3, 5, 7} and 1 are presumably the kernel sizes and the
 //! stride -- confirm against get_nchw44_conv_bias_args.
 #define CB(_MODE, _SUFFIX)                                                        \
    TEST_F(FALLBACK_MULTI_THREADS,                                                \
           CONVBIAS_GI_NCHW_NCHW44_F32_S1_AGENT_IDENTITY_##_SUFFIX) {             \
        check_conv_bias(                                                          \
                conv_bias::get_nchw44_conv_bias_args(                             \
                        {2, 3, 5, 7}, {_MODE}, ONLY_BR_BIASMODE, 1, false, true), \
                handle(), "F32_CONV_AGENT_NCHW_NCHW44");                          \
    }
 //! instantiate for every nonlinearity accepted by the agent algorithm
 CB(param::ConvBias::NonlineMode::IDENTITY, IDENTITY);
 CB(param::ConvBias::NonlineMode::RELU, RELU);
 CB(param::ConvBias::NonlineMode::H_SWISH, H_SWISH);
 CB(param::ConvBias::NonlineMode::SIGMOID, SIGMOID);
 #undef CB

 std::vector<conv_bias::TestArg> get_nchw44_channel_wise_args(
        std::vector<size_t> kernel, size_t stride, bool no_bias, bool no_nonlinemode,
        bool no_full_bias) {