feat(opr): use weight preprocess feature of MegDNN

GitOrigin-RevId: 779041f8a8
4 years ago · 75eebb7c42
--- a/dnn/include/megdnn/oprs/nn.h
+++ b/dnn/include/megdnn/oprs/nn.h
@@ -51,6 +51,17 @@ protected:
 };
 using SeparableConv = SeparableConvForward;

 namespace detail {

 /*!
  * \brief preprocessed filter handed to convolution-like operators
  *
  * Holds an opaque algorithm tag together with the tensors that carry the
  * preprocessed filter data.
  */
 struct PreprocessedFilter {
    //! user data; its lifetime should be bound to MegDNN Convolution
    //! operator
    void* algorithm_id;
    //! tensors that hold the preprocessed filter
    TensorNDArray tensors;
 };

 }  // namespace detail

 /**
 * \brief base class for convolution operation
 *
@@ -131,13 +142,7 @@ public:
            return flag;
        }
    };

    struct PreprocessedFilter {
        //! user data; its lifetime should be bound to MegDNN Convolution
        //! operator
        void* algorithm_id;
        TensorNDArray tensors;
    };
    using PreprocessedFilter = detail::PreprocessedFilter;

 protected:
    // Check or deduce output DType
--- a/src/opr/impl/dnn/convolution.cpp
+++ b/src/opr/impl/dnn/convolution.cpp
@@ -10,6 +10,7 @@
 */

 #include "megbrain/opr/dnn/convolution.h"
 #include "megbrain/opr/io.h"

 #include "megbrain/graph/grad_impl.h"
 #include "megbrain/system.h"
@@ -95,67 +96,14 @@ MGB_FOREACH_FASTRUN_OPR(cb)

 #undef cb

 //! Per-operator attribute queries; the generic version reports that weights
 //! are never persistent.
 template <class MGBOpr>
 struct OprAttributeTrait {
    static bool is_weights_persistent(const MGBOpr*) { return false; }
 };

 //! ConvBias specialization: inspects the flag of the second input (the
 //! weights).
 template <>
 struct OprAttributeTrait<opr::ConvBias> {
    //! return true if the flag of weights is PERSISTENT_DEVICE_VALUE, false
    //! otherwise. True means weights can be transformed in the first run.
    static bool is_weights_persistent(const opr::ConvBias* opr) {
        return opr->input()[1]->contain_flag(
                VarNode::Flag::PERSISTENT_DEVICE_VALUE);
    }
 };

 template <typename Opr>
 constexpr bool opr_supports_preprocess() {
    return std::is_same<Opr, megdnn::ConvolutionForward>::value ||
           std::is_same<Opr, megdnn::ConvBias>::value;
 }

 template <typename Opr>
 struct OprArityTrait;

 //! Evaluate \p statement inside a by-reference generic lambda whose parameter
 //! pack `args` receives the elements of std::tuple_cat(<remaining arguments>).
 //! The lambda and the concatenated tuple are passed to mgb::apply
 //! (presumably a std::apply-like unpacker -- confirm against its definition).
 #define APPLY(statement, ...)                                  \
    mgb::apply([&](const auto&... args) { return statement; }, \
               std::tuple_cat(__VA_ARGS__))

 /*!
  * \brief arity information and call helpers for a MegDNN operator type
  *
  * \tparam Opr MegDNN operator type
  * \tparam _arity_in number of input tensors
  * \tparam _arity_out number of output tensors
  */
 template <typename Opr, int _arity_in, int _arity_out>
 struct OprArityTraitTmpl {
    static constexpr int arity_in = _arity_in;
    static constexpr int arity_out = _arity_out;
    //! total number of tensors (inputs followed by outputs)
    static constexpr int arity = arity_in + arity_out;
    using Algorithm = typename Opr::Algorithm;
    using TensorLayoutArray = std::array<TensorLayout, arity>;

    //! Select \p algo as the execution policy of \p opr, then query the
    //! workspace size for \p layouts. Note that the policy of \p opr is
    //! modified as a side effect.
    static size_t get_workspace_in_bytes(Opr* opr, Algorithm* algo,
                                         const TensorLayoutArray& layouts) {
        opr->execution_policy() = {algo};
        size_t workspace_size;
        if_constexpr<opr_supports_preprocess<Opr>()>([&](auto) {
            // preprocess-capable operators take an extra trailing
            // preprocessed-filter argument; none is supplied here
            workspace_size = APPLY(
                    opr->get_workspace_in_bytes(args..., nullptr), layouts);
        }, /* else */ [&](auto) {
            workspace_size =
                    APPLY(opr->get_workspace_in_bytes(args...), layouts);
        });
        return workspace_size;
    }

    //! Run \p opr on the given input/output device tensors (each converted
    //! with as_megdnn()) using \p workspace.
    static void exec(Opr* opr,
                     const std::array<DeviceTensorND, arity_in>& inp_val,
                     const std::array<DeviceTensorND, arity_out>& out_val,
                     megdnn::Workspace& workspace) {
        if_constexpr<opr_supports_preprocess<Opr>()>([&](auto) {
            // nullptr: execute without a preprocessed filter
            APPLY(opr->exec(args.as_megdnn()..., nullptr, workspace), inp_val,
                   out_val);
        }, /* else */ [&](auto) {
            APPLY(opr->exec(args.as_megdnn()..., workspace), inp_val, out_val);
        });
    }
 };

 #define INST_ARITY(_Opr, _in, _out) \
@@ -179,6 +127,26 @@ INST_ARITY(megdnn::DeformableConvBackwardData, 5, 3);

 #undef INST_ARITY

 template <typename Opr>
 constexpr bool opr_supports_preprocess() {
    return std::is_same<Opr, megdnn::ConvolutionForward>::value ||
           std::is_same<Opr, megdnn::ConvBias>::value;
 }

 //! Maps an operator type to the type used for its preprocessed filter.
 //! Primary template (operator without preprocess): T is an empty union that
 //! only serves as a placeholder type.
 template <class OprT, bool kHasPreprocess>
 struct PreprocessFilterImpl {
    using T = union {};
 };

 //! Operator with preprocess: T is the operator's own PreprocessedFilter.
 template <class OprT>
 struct PreprocessFilterImpl<OprT, true> {
    using T = typename OprT::PreprocessedFilter;
 };

 //! Preprocessed-filter type for \p Opr: selects the PreprocessFilterImpl
 //! variant according to opr_supports_preprocess<Opr>().
 template <typename Opr>
 using PreprocessFilter =
        typename PreprocessFilterImpl<Opr, opr_supports_preprocess<Opr>()>::T;

 // timeout delta to be added with fastest known algorithm for new algos
 constexpr double TIMEOUT_TOLERANCE = 2;

@@ -225,6 +193,7 @@ public:
        CompNode::Locator comp_node_loc;
        ConvTensorShapes shapes;
        typename Opr::Param opr_param;
        bool allow_weight_preprocess;

        //! filled by profile()
        mutable double actual_timeout;
@@ -277,6 +246,10 @@ double TimedProfiler<Opr>::init_timeout_setting() {
    return 0;
 }

 //! Evaluate \p statement inside a by-reference generic lambda whose parameter
 //! pack `args` receives the elements of std::tuple_cat(<remaining arguments>).
 //! The lambda and the concatenated tuple are passed to mgb::apply
 //! (presumably a std::apply-like unpacker -- confirm against its definition).
 #define APPLY(statement, ...)                                  \
    mgb::apply([&](const auto&... args) { return statement; }, \
               std::tuple_cat(__VA_ARGS__))

 template <typename Opr>
 typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
        const TParam& raw_param) {
@@ -324,6 +297,16 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
        megdnn_opr->execution_policy() = {algo};
    }

    // Allocate preprocessed weight buffers.
    TensorLayoutArray preprocessed_layout;
    if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
        if (param.allow_weight_preprocess) {
            preprocessed_layout = APPLY(
                    _(megdnn_opr)->deduce_preprocessed_filter_layout(args...),
                    layouts);
        }
    });

    {
        // first allocate a whole chunk to avoid memory fragmentation (here we
        // rely on memory allocator to reuse memory)
@@ -332,6 +315,9 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
        for (int i = 0; i < arity; ++i) {
            tot_size += layouts[i].span().high_byte + align;
        }
        for (const auto& layout : preprocessed_layout) {
            tot_size += layout.span().high_byte + align;
        }
        tot_size += param.workspace;
        DeviceTensorStorage storage{cn};
        storage.ensure_size(tot_size);
@@ -362,15 +348,46 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
        mdn_workspace.raw_ptr = workspace.raw_ptr();
    }

    // allocate storage for preprocessed filter
    SmallVector<DeviceTensorND> flt_val(preprocessed_layout.size());
    for (size_t i = 0; i < preprocessed_layout.size(); i++) {
        flt_val[i] = {cn, preprocessed_layout[i], preprocessed_layout[i].dtype,
                      preprocessed_layout[i].format};
    }

    for (int i = 0; i < arity_in; ++i) {
        fill_zero_dev_tensor(inp_val[i]);
    }

    PreprocessFilter<Opr> prep_flt;
    if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
        if (!preprocessed_layout.empty()) {
            auto&& pf = _(prep_flt);
            pf.algorithm_id = nullptr;
            pf.tensors.resize(flt_val.size());
            for (size_t i = 0; i < flt_val.size(); i++) {
                pf.tensors[i] = flt_val[i].as_megdnn();
            }
            APPLY(_(megdnn_opr)->exec_preprocess(args..., &pf, mdn_workspace),
                  std::forward_as_tuple(layouts[0], inp_val[1].as_megdnn()),
                  array_skip<2>(layouts));
        }
    });

    RealTimer timer;
    auto ev_start = cn.create_event(CompNode::Event::NEED_TIMER),
         ev_end = cn.create_event(CompNode::Event::NEED_TIMER);
    ev_start->record();
    OprArityTrait<Opr>::exec(megdnn_opr.get(), inp_val, out_val, mdn_workspace);
    if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
        auto&& opr = _(megdnn_opr);
        PreprocessFilter<Opr>* pf =
                preprocessed_layout.empty() ? nullptr : &prep_flt;
        APPLY(opr->exec(args.as_megdnn()..., pf, mdn_workspace), inp_val,
              out_val);
    }, /* else */ [&](auto _) {
        APPLY(_(megdnn_opr)->exec(args.as_megdnn()..., mdn_workspace), inp_val,
              out_val);
    });
    ev_end->record();

    double next_report_time = 0.5;
@@ -425,13 +442,15 @@ class AlgoChooser {
        const ConvTensorLayouts& m_layouts;
        Opr* m_megdnn_opr;
        const MGBOpr* m_mgb_opr;
        bool m_allow_weight_preprocess;

    public:
        ExeContext(const ConvTensorLayouts& layouts, Opr* megdnn_opr,
                   const MGBOpr* mgb_opr)
                   const MGBOpr* mgb_opr, bool allow_weight_preprocess)
                : m_layouts{layouts},
                  m_megdnn_opr{megdnn_opr},
                  m_mgb_opr{mgb_opr} {
                  m_mgb_opr{mgb_opr},
                  m_allow_weight_preprocess{allow_weight_preprocess} {
            mgb_assert(m_layouts.size() == layouts.size());
            static_assert(
                    std::tuple_size<ConvTensorLayouts>::value == 3 ||
@@ -499,8 +518,23 @@ class AlgoChooser {

        //! get workspace size required for specific algo
        size_t get_workspace_size_bytes(ImplAlgo algo) const {
            return OprArityTrait<Opr>::get_workspace_in_bytes(m_megdnn_opr,
                                                              algo, m_layouts);
            m_megdnn_opr->execution_policy() = {algo};
            size_t result;
            if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
                auto&& opr = _(m_megdnn_opr);
                auto prep = construct_fake_preprocess_filter();
                PreprocessFilter<Opr>* prep_ptr =
                        prep.valid() ? &prep.val() : nullptr;
                result = std::max(
                        APPLY(opr->get_preprocess_workspace_in_bytes(args...),
                              m_layouts),
                        APPLY(opr->get_workspace_in_bytes(args..., prep_ptr),
                              m_layouts));
            }, /* else */ [&](auto _) {
                result = APPLY(_(m_megdnn_opr)->get_workspace_in_bytes(args...),
                               m_layouts);
            });
            return result;
        }

        /*!
@@ -525,6 +559,28 @@ class AlgoChooser {
         */
        void modify_param_with_weights_preprocessed(
                typename TimedProfiler<Opr>::Param& param) const {}
        
        /*!
         * \brief build a placeholder preprocessed filter
         *
         * The result carries the deduced preprocessed-filter layouts with
         * null data pointers and a null algorithm_id. It stays None when the
         * operator type has no preprocess support, when weight preprocess is
         * not allowed, or when the deduced layout list is empty.
         */
        Maybe<PreprocessFilter<Opr>> construct_fake_preprocess_filter() const {
            Maybe<PreprocessFilter<Opr>> fake = None;
            if_constexpr<opr_supports_preprocess<Opr>()>([&](auto _) {
                if (m_allow_weight_preprocess) {
                    auto dnn_opr = _(m_megdnn_opr);
                    auto prep_layouts = APPLY(
                            dnn_opr->deduce_preprocessed_filter_layout(args...),
                            m_layouts);
                    if (!prep_layouts.empty()) {
                        fake = PreprocessFilter<Opr>{};
                        auto& filter = fake.val();
                        filter.algorithm_id = nullptr;
                        filter.tensors.resize(prep_layouts.size());
                        // layout-only tensors: no storage is attached
                        for (size_t idx = 0; idx < prep_layouts.size();
                             ++idx) {
                            filter.tensors[idx] = megdnn::TensorND(
                                    nullptr, prep_layouts[idx]);
                        }
                    }
                }
            });
            return fake;
        }
    };

    //! entrance for getting algorithm according to execution strategy
@@ -571,12 +627,13 @@ public:
     * \brief setup algorithm and return workspace size
     */
    static size_t setup_algo(const ConvTensorLayouts& layouts, Opr* megdnn_opr,
                             const MGBOpr* mgb_opr) {
                             const MGBOpr* mgb_opr,
                             bool allow_weight_preprocess = false) {
        if (WorkspaceLimitGetter::is_prealloc_run(mgb_opr->owner_graph())) {
            return 0;
        }

        ExeContext ctx(layouts, megdnn_opr, mgb_opr);
        ExeContext ctx(layouts, megdnn_opr, mgb_opr, allow_weight_preprocess);

        auto algo = get_algo(ctx);
        size_t workspace = ctx.get_workspace_size_bytes(algo);
@@ -780,9 +837,6 @@ Maybe<AlgoChooserProfileCache::ResultEntry>
 AlgoChooser<Opr>::ExeContext::profile_single_algo(ImplAlgo algo,
                                                  double& timeout) const {
    typename TimedProfiler<Opr>::Param param;
    bool is_weights_persistent =
            OprAttributeTrait<typename MegDNNOpr2MGBOpr<Opr>::MGBOpr>::
                    is_weights_persistent(m_mgb_opr);
    auto name = algo->name();
    // force check copy size <= dest len-1 from gcc8 for safe
    auto len = sizeof(param.algo_name);
@@ -806,8 +860,9 @@ AlgoChooser<Opr>::ExeContext::profile_single_algo(ImplAlgo algo,
    for (size_t i = 0; i < param.shapes.size(); ++i)
        param.shapes[i] = m_layouts[i];
    param.opr_param = m_megdnn_opr->param();
    param.allow_weight_preprocess = m_allow_weight_preprocess;

    if (is_weights_persistent) {
    if (m_allow_weight_preprocess) {
        modify_param_with_weights_preprocessed(param);
    }

@@ -911,6 +966,78 @@ AlgoChooserProfileCache& mixin::Convolution::profile_cache() const {
    return *m_profile_cache;
 }

 //! Execution dependency that only keeps a preprocessed filter and the device
 //! tensors backing it alive; it exposes no further behavior.
 class mixin::WeightPreprocessExecutor::PreprocessedFilterExecDep final
        : public cg::GraphExecutable::ExecDependency {
 public:
    //! take ownership of the filter descriptor and of its storage
    explicit PreprocessedFilterExecDep(
            std::unique_ptr<PreprocessedFilter> filter,
            SmallVector<DeviceTensorND> storage)
            : m_pf(std::move(filter)), m_filter_storage(std::move(storage)) {}

 private:
    // declaration order is kept: descriptor first, storage second
    std::unique_ptr<PreprocessedFilter> m_pf;
    SmallVector<DeviceTensorND> m_filter_storage;
 };

 /*!
  * \brief (re)build the cached preprocessed filter when it is missing or stale
  *
  * Does nothing when weight preprocess is not allowed for \p opr. When the
  * deduced layout list is empty, any existing cache is dropped. Otherwise the
  * cache is rebuilt -- storage allocated on the comp node of output(0) and
  * scn_do_execute_preprocess() invoked -- only if the cached filter is absent
  * or its tensor count/layouts differ from the newly deduced ones.
  */
 void mixin::WeightPreprocessExecutor::mixin_update_preprocessed_filter(
        cg::OperatorNodeBase& opr) {
    if (!mixin_allow_weight_preprocess(opr)) {
        return;
    }

    auto layouts = deduce_preprocessed_filter_layout();
    if (layouts.empty()) {
        // Weight preprocess was needed before, but no longer needed.
        if (m_preprocessed_filter) {
            m_preprocessed_filter.reset();
            m_filter_storage.clear();
        }
        return;
    }

    const size_t nr_tensor = layouts.size();

    // The cache can be reused only if it exists, has the same number of
    // tensors, and every tensor layout equals the freshly deduced one.
    auto cache_is_valid = [&]() {
        if (!m_preprocessed_filter ||
            m_preprocessed_filter->tensors.size() != nr_tensor) {
            return false;
        }
        for (size_t idx = 0; idx < nr_tensor; ++idx) {
            if (!layouts[idx].eq_layout(
                        m_preprocessed_filter->tensors[idx].layout)) {
                return false;
            }
        }
        return true;
    };
    if (cache_is_valid()) {
        return;
    }

    if (!m_preprocessed_filter) {
        m_preprocessed_filter.reset(new PreprocessedFilter{});
    }
    m_preprocessed_filter->tensors.resize(nr_tensor);
    m_filter_storage.resize(nr_tensor);
    m_preprocessed_filter->algorithm_id = nullptr;
    for (size_t idx = 0; idx < nr_tensor; ++idx) {
        // allocate backing storage, then expose it through the descriptor
        m_filter_storage[idx] = {opr.output(0)->comp_node(), layouts[idx],
                                 layouts[idx].dtype, layouts[idx].format};
        m_preprocessed_filter->tensors[idx] =
                m_filter_storage[idx].as_megdnn();
    }
    scn_do_execute_preprocess();
 }

 //! Hand the cached preprocessed filter and its storage over to a new
 //! PreprocessedFilterExecDep appended to \p deps; both members are left in
 //! a moved-from state afterwards.
 void mixin::WeightPreprocessExecutor::record_preprocessed_weight(
        cg::GraphExecutable::ExecDependencyArray& deps) {
    auto* filter_dep = new PreprocessedFilterExecDep{
            std::move(m_preprocessed_filter), std::move(m_filter_storage)};
    deps.emplace_back(filter_dep);
 }

 //! Weight preprocess is allowed when input(1) of \p opr carries the
 //! PERSISTENT_DEVICE_VALUE flag and is either a const var value or produced
 //! by a MultipleDeviceTensorHolder operator.
 bool mixin::WeightPreprocessExecutor::mixin_allow_weight_preprocess(
        const cg::OperatorNodeBase& opr) const {
    auto weight = opr.input(1);
    const bool from_merged_param =
            weight->owner_opr()
                    ->same_type<opr::MultipleDeviceTensorHolder>();
    if (!weight->contain_flag(VarNode::Flag::PERSISTENT_DEVICE_VALUE)) {
        return false;
    }
    if (cg::is_const_var_value(weight)) {
        return true;
    }
    return from_merged_param;
 }

 /* ==================== ConvolutionForward  ==================== */

 IMPL_CONV(ConvolutionForward, "conv_fwd");
@@ -971,7 +1098,7 @@ size_t ConvolutionForward::get_workspace_size_bytes(
                          input(0)->format()},
             {input_shapes[1], input(1)->dtype(), input(1)->format()},
             {output_shapes[0], output(0)->dtype(), output(0)->format()}},
            megdnn_opr(), this);
            megdnn_opr(), this, allow_weight_preprocess());
 }

 void ConvolutionForward::init_output_format() {
@@ -980,9 +1107,14 @@ void ConvolutionForward::init_output_format() {
 }

 void ConvolutionForward::scn_do_execute() {
    if (input(1)->contain_flag(VarNode::Flag::PERSISTENT_DEVICE_VALUE) &&
        cg::is_const_var_value(input(1))) {
        update_preprocessed_filter();
    }
    megdnn_opr()->exec(input(0)->dev_tensor().as_megdnn(),
                       input(1)->dev_tensor().as_megdnn(),
                       output(0)->dev_tensor().as_megdnn(), nullptr,
                       output(0)->dev_tensor().as_megdnn(),
                       preprocessed_filter(),
                       intl::get_megdnn_workspace_from_var(output().back()));
 }

@@ -1012,6 +1144,20 @@ void ConvolutionForward::get_output_var_shape(
 //! Register execution dependencies in \p deps: first the MegDNN operator,
 //! then the preprocessed weight.
 void ConvolutionForward::record_execute_deps(
        cg::GraphExecutable::ExecDependencyArray& deps) {
    record_megdnn_opr(deps);
    record_preprocessed_weight(deps);
 }

 //! Ask the MegDNN operator for the preprocessed-filter layouts, given the
 //! current layouts of input(0), input(1) and output(0).
 SmallVector<TensorLayout>
 ConvolutionForward::deduce_preprocessed_filter_layout() {
    auto&& src_layout = input(0)->layout();
    auto&& filter_layout = input(1)->layout();
    auto&& dst_layout = output(0)->layout();
    return megdnn_opr()->deduce_preprocessed_filter_layout(
            src_layout, filter_layout, dst_layout);
 }

 //! Run the MegDNN filter preprocess on the device tensor of input(1),
 //! writing into preprocessed_filter() and using the workspace taken from the
 //! last output var.
 void ConvolutionForward::scn_do_execute_preprocess() {
    auto&& src_layout = input(0)->layout();
    auto&& dst_layout = output(0)->layout();
    megdnn_opr()->exec_preprocess(
            src_layout, input(1)->dev_tensor().as_megdnn(), dst_layout,
            preprocessed_filter(),
            intl::get_megdnn_workspace_from_var(output().back()));
 }

 /* ==================== ConvolutionBackwardData  ==================== */
@@ -1504,10 +1650,12 @@ size_t ConvBiasForward::get_workspace_size_bytes(
             i2,
             i3,
             {output_shapes[0], output(0)->dtype(), output(0)->format()}},
            mo, this);
            mo, this, allow_weight_preprocess());
 }

 void ConvBiasForward::scn_do_execute() {
    update_preprocessed_filter();

    auto&& inp = input();
    auto mo = megdnn_opr();
    if (inp.size() == 2) {
@@ -1621,6 +1769,33 @@ megdnn::param::MatrixMul::Format ConvBiasForward::get_matmul_format(
    }
 }

 //! Ask the MegDNN operator for the preprocessed-filter layouts. The layouts
 //! of input(2) and input(3) are forwarded when those inputs exist;
 //! default-constructed layouts are passed otherwise.
 SmallVector<TensorLayout> ConvBiasForward::deduce_preprocessed_filter_layout() {
    TensorLayout bias_layout, z_layout;
    if (input().size() > 2) {
        bias_layout = input(2)->layout();
    }
    if (input().size() > 3) {
        z_layout = input(3)->layout();
    }
    auto&& src_layout = input(0)->layout();
    auto&& filter_layout = input(1)->layout();
    auto&& dst_layout = output(0)->layout();
    return megdnn_opr()->deduce_preprocessed_filter_layout(
            src_layout, filter_layout, bias_layout, z_layout, dst_layout);
 }

 void ConvBiasForward::scn_do_execute_preprocess() {
    TensorLayout bias_layout(output(0)->dtype()), z_layout(output(0)->dtype());
    if (input().size() > 2) {
        bias_layout = input(2)->layout();
    }
    if (input().size() > 3) {
        z_layout = input(3)->layout();
    }
    megdnn_opr()->exec_preprocess(
            input(0)->layout(), input(1)->dev_tensor().as_megdnn(), bias_layout,
            z_layout, output(0)->layout(), preprocessed_filter(),
            intl::get_megdnn_workspace_from_var(output().back()));
 }

 /* ===================== LocalShareForward ==================== */

 IMPL_CONV(LocalShareForward, "local_share");
--- a/src/opr/include/megbrain/opr/dnn/convolution.h
+++ b/src/opr/include/megbrain/opr/dnn/convolution.h
@@ -72,13 +72,52 @@ class Convolution {
                cg::OperatorNodeBase* self);
 };

 /*!
  * \brief mixin that caches a preprocessed filter for an operator
  *
  * Owns the preprocessed-filter descriptor and the device tensors backing it.
  * Concrete operators supply the layout deduction and the actual preprocess
  * execution through the two pure virtual hooks.
  */
 class WeightPreprocessExecutor : public cg::OperatorNodeMixinBase {
    class PreprocessedFilterExecDep;

    using PreprocessedFilter = megdnn::detail::PreprocessedFilter;
    //! cached descriptor; null while no preprocessed filter is held
    std::unique_ptr<PreprocessedFilter> m_preprocessed_filter;
    //! device tensors referenced by the tensors of m_preprocessed_filter
    SmallVector<DeviceTensorND> m_filter_storage;
 protected:
    //! this should only be called in scn_do_execute or similar functions (i.e.
    //! post dispatch-to-ExecEnv)
    void mixin_update_preprocessed_filter(OperatorNodeBase& opr);
    //! move the cached filter and its storage into a dependency in \p deps
    void record_preprocessed_weight(
            cg::GraphExecutable::ExecDependencyArray& deps);
    //! currently cached filter, or nullptr if there is none
    PreprocessedFilter* preprocessed_filter() const {
        return m_preprocessed_filter.get();
    }

    //! whether the filter input of \p opr qualifies for weight preprocess
    bool mixin_allow_weight_preprocess(const OperatorNodeBase& opr) const;
    //! hook: layouts of the preprocessed filter tensors; empty means that no
    //! preprocess is needed
    virtual SmallVector<TensorLayout> deduce_preprocessed_filter_layout() = 0;
    //! hook: execute the preprocess, filling preprocessed_filter()
    virtual void scn_do_execute_preprocess() = 0;
 };

 } // namespace mixin

 namespace intl {
    //! glue class to apply mixin::WeightPreprocessExecutor
    //!
    //! Forwards to the mixin's mixin_* members, passing `*this` as the
    //! operator node.
    template<class Base = cg::OperatorNodeBase,
             class MixinImpl = mixin::WeightPreprocessExecutor>
    class OprWithWeightPreprocess: public mixin::CheckBase<Base>::Base,
                                   public MixinImpl {
    protected:
        using Base::Base;

        //! refresh the cached preprocessed filter of this operator
        void update_preprocessed_filter() {
            this->mixin_update_preprocessed_filter(*this);
        }

        //! whether weight preprocess is allowed for this operator
        bool allow_weight_preprocess() const {
            return this->mixin_allow_weight_preprocess(*this);
        }
    };

    using ConvBiasBase = cg::SingleCNOperatorNode<
            cg::OutshapePureByInshapeOpr<>,
            mixin::MegDNNOprHolderImpl<megdnn::ConvBiasForward>>;
    using ConvBiasForwardBase = WorkspaceSizeInfer<ConvBiasBase>;
    using ConvBiasForwardBase =
            OprWithWeightPreprocess<WorkspaceSizeInfer<ConvBiasBase>>;

    using DeformableConvBackwardDataT = cg::SingleCNOperatorNode<
            cg::OutshapePureByInshapeOpr<>,
@@ -90,12 +129,20 @@ namespace intl {
            mixin::MegDNNOprHolderImpl<megdnn::BatchConvBiasForward>>;
    using BatchConvBiasForwardBase = WorkspaceSizeInfer<BatchConvBiasBase>;

    using ConvolutionForwardBase = WorkspaceSizeInfer<
            typename MegDNNOprWrapperFwdBase<megdnn::ConvolutionForward>::Base>;
    using ConvolutionForwardBase = OprWithWeightPreprocess<
            WorkspaceSizeInfer<typename MegDNNOprWrapperFwdBase<
                    megdnn::ConvolutionForward>::Base>>;
 }  // namespace intl

 namespace testing {

 class ConvolutionTestingPeer;

 }  // namespace testing

 MGB_DEFINE_OPR_CLASS(ConvolutionForward,
        intl::ConvolutionForwardBase, public mixin::Convolution) // {

    void init_profile_cache() override;
    void init_output_dtype() override;
    size_t get_workspace_size_bytes(
@@ -109,6 +156,10 @@ MGB_DEFINE_OPR_CLASS(ConvolutionForward,
                              TensorShapeArray& out_shape) const override final;
    void record_execute_deps(
            cg::GraphExecutable::ExecDependencyArray& deps) override;
    SmallVector<TensorLayout> deduce_preprocessed_filter_layout() override;
    void scn_do_execute_preprocess() override;

    friend testing::ConvolutionTestingPeer;

    public:
        ConvolutionForward(VarNode *src, VarNode *filter,
@@ -142,7 +193,10 @@ MGB_DEFINE_OPR_CLASS(ConvBiasForward, intl::ConvBiasForwardBase,
    void record_execute_deps(
            cg::GraphExecutable::ExecDependencyArray& deps) override {
        this->record_megdnn_opr(deps);
        this->record_preprocessed_weight(deps);
    }
    SmallVector<TensorLayout> deduce_preprocessed_filter_layout() override;
    void scn_do_execute_preprocess() override;

 public:
    //! src * filter
--- a/src/opr/test/dnn/convolution.cpp
+++ b/src/opr/test/dnn/convolution.cpp
@@ -21,6 +21,8 @@
 #include "megbrain/gopt/inference.h"
 #include "megbrain/opr/tensor_manip.h"

 #include <gmock/gmock.h>

 #include <cmath>
 #include <random>

@@ -244,7 +246,6 @@ opr::Convolution::Param convert_to_conv_param(
            param.dilate_w, param.sparse,   param.format};
 };
 #endif
 } // anonymous namespace

 TEST(TestOprDNN, ConvolutionForward) {
    uint32_t ih = 10, ic = 16, oc = 32, ph = 0, sh = 1, fh = 2;
@@ -1172,6 +1173,7 @@ TEST(TestOprDNN, ConvBiasForward) {
                          {1, OC, 1, 1}},
                         opt3);
        };
        run(1, 1, 1, 5, 5, 1, 1);
        run(1, 1, 1, 5, 5, 3, 3);
        run(2, 3, 4, 5, 5, 3, 3);
        run(3, 3, 4, 224, 223, 3, 3);
@@ -2124,4 +2126,225 @@ TEST(TestOprDNN, ConvolutionMultiCompNode) {

 #endif

 }  // anonymous namespace

 namespace mgb {
 namespace opr {
 namespace testing {

 //! Test-only accessor (declared as a friend of opr::ConvolutionForward) that
 //! lets a test replace the operator's underlying MegDNN operator.
 class ConvolutionTestingPeer {
 public:
    //! \p opr is cast to opr::ConvolutionForward via cast_final_safe
    explicit ConvolutionTestingPeer(cg::OperatorNodeBase* opr)
            : m_conv_opr(opr->cast_final_safe<opr::ConvolutionForward>()) {}

    //! install \p megdnn_opr into the wrapped convolution operator
    void set_megdnn_opr(
            std::unique_ptr<megdnn::ConvolutionForward> megdnn_opr) {
        auto replacement = std::move(megdnn_opr);
        m_conv_opr.set_megdnn_opr(std::move(replacement));
    }

 private:
    opr::ConvolutionForward& m_conv_opr;
 };

 }  // namespace testing
 }  // namespace opr
 }  // namespace mgb

 namespace {

 using megdnn::TensorND;
 using megdnn::Workspace;
 using opr::testing::ConvolutionTestingPeer;

 //! gmock replacement for megdnn::ConvolutionForward. It is constructed on
 //! the handle of an existing operator; every mocked method must be given
 //! expectations by the test.
 class MockConvolutionForward : public megdnn::ConvolutionForward {
    //! value reported by get_algorithm_set_name()
    const char* m_algorithm_set_name;
 public:
    MockConvolutionForward(megdnn::ConvolutionForward* orig,
                           const char* algo_set_name)
            : megdnn::ConvolutionForward(orig->handle()),
              m_algorithm_set_name(algo_set_name) {}

    // execution with an optional preprocessed filter
    MOCK_METHOD5(exec, void(_megdnn_tensor_in src, _megdnn_tensor_in filter,
                            _megdnn_tensor_out dst,
                            const PreprocessedFilter* preprocessed_filter,
                            _megdnn_workspace workspace));
    // filter preprocess; writes into preprocessed_filter
    MOCK_METHOD5(exec_preprocess,
                 void(const TensorLayout& src_layout, _megdnn_tensor_in filter,
                      const TensorLayout& dst_layout,
                      PreprocessedFilter* preprocessed_filter,
                      _megdnn_workspace workspace));
    // workspace queries for exec and for exec_preprocess
    MOCK_METHOD4(get_workspace_in_bytes,
                 size_t(const TensorLayout& src, const TensorLayout& filter,
                        const TensorLayout& dst,
                        const PreprocessedFilter* preprocessed_filter));
    MOCK_METHOD3(deduce_preprocessed_filter_layout,
                 SmallVector<TensorLayout>(const TensorLayout& src,
                                           const TensorLayout& filter,
                                           const TensorLayout& dst));
    MOCK_METHOD3(get_preprocess_workspace_in_bytes,
                 size_t(const TensorLayout& src, const TensorLayout& filter,
                        const TensorLayout& dst));
    // algorithm selection
    MOCK_METHOD3(get_all_algorithms,
                 std::vector<Algorithm*>(const TensorLayout& p0,
                                         const TensorLayout& p1,
                                         const TensorLayout& p2));
    MOCK_METHOD5(get_algorithm_heuristic,
                 Algorithm*(const TensorLayout& p0, const TensorLayout& p1,
                            const TensorLayout& p2,
                            size_t workspace_limit_in_bytes,
                            bool reproducible));
    const char* get_algorithm_set_name() const override {
        return m_algorithm_set_name;
    }
 };

 class MockAlgorithm : public megdnn::detail::Algorithm {
    const char* m_name;

 public:
    MockAlgorithm(const char* name = "NotImportant") : m_name(name) {}
    bool is_reproducible() const override { return true; }
    const char* name() const override { return m_name; }

    virtual ~MockAlgorithm() = default;
 };

 //! Fixture: builds a graph with one ConvolutionForward whose filter is an
 //! ImmutableTensor, swaps the operator's MegDNN implementation for a
 //! MockConvolutionForward, and compiles a function that copies the output to
 //! y_host.
 class TestWeightPreprocess : public ::testing::Test {
 protected:
    CompNode comp_node;
    std::shared_ptr<ComputingGraph> graph;
    std::shared_ptr<HostTensorND> x_host;
    //! non-owning; ownership of the mock is handed to the operator in SetUp
    MockConvolutionForward* mock_conv_ptr;
    SymbolVar y;
    HostTensorND y_host;
    std::unique_ptr<cg::AsyncExecutable> func;

    MockConvolutionForward& mock_conv() { return *mock_conv_ptr; }

    void SetUp() override {
        constexpr uint32_t ih = 10, ic = 16, oc = 32, ph = 0, sh = 1, fh = 2,
                           iw = ih;
        comp_node = CompNode::load("cpux");
        graph = ComputingGraph::make();
        TensorShape x_shape{1, ic, ih, iw}, w_shape{oc, ic, fh, fh};
        x_host = std::make_shared<HostTensorND>(comp_node, x_shape);
        auto x = opr::Host2DeviceCopy::make(*graph, x_host);
        auto w = opr::ImmutableTensor::make(*graph, {comp_node, w_shape});
        Param param;
        param.pad_h = param.pad_w = ph;
        param.stride_h = param.stride_w = sh;
        param.format = Param::Format::NCHW;
        y = opr::ConvolutionForward::make(x, w, param);
        auto& opr =
                y.node()->owner_opr()->cast_final<opr::ConvolutionForward>();
        // the mock uses the handle of the real operator and reports the
        // current test name as its algorithm set name
        auto mock = std::make_unique<MockConvolutionForward>(
                opr.megdnn_opr(), ::testing::UnitTest::GetInstance()
                                          ->current_test_info()
                                          ->name());
        // keep a raw pointer for setting expectations after the move
        mock_conv_ptr = mock.get();
        ConvolutionTestingPeer{&opr}.set_megdnn_opr(std::move(mock));
        func = graph->compile({make_callback_copy(y, y_host)});
    }

    //! execute the compiled function once and wait for completion
    void run() { func->execute().wait(); }

    void TearDown() override {
        func.reset();
        // Triggers mock check
        graph.reset();
        x_host.reset();
    }
 };

 //! When the deduced preprocessed-filter layout is empty, exec_preprocess
 //! must never run and exec must receive a null preprocessed filter.
 TEST_F(TestWeightPreprocess, NoPreprocessNeeded) {
    using ::testing::_;
    using ::testing::Return;
    auto& mock = mock_conv();

    MockAlgorithm algo;
    // algorithm selection and workspace queries are not under test here
    EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _))
            .WillRepeatedly(Return(&algo));
    EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
            .WillRepeatedly(Return(0));
    EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
            .WillRepeatedly(Return(0));

    {
        ::testing::InSequence seq;
        // Return empty preprocess filters, indicating no need to preprocess
        EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _))
                .WillRepeatedly(Return(SmallVector<TensorLayout>{}));
        EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _)).Times(0);
        // the fourth argument of exec is the preprocessed filter
        EXPECT_CALL(mock, exec(_, _, _, nullptr, _));
        run();
    }
 }

 //! With a non-empty preprocessed-filter layout, exec_preprocess must run
 //! exactly once across several graph executions, and every exec must see the
 //! filter contents written by that single preprocess call.
 TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) {
    using ::testing::_;
    using ::testing::Return;
    using ::testing::Field;
    using ::testing::Invoke;
    using ::testing::Expectation;
    using PF = MockConvolutionForward::PreprocessedFilter;

    auto& mock = mock_conv();
    MockAlgorithm algo;
    // two preprocessed tensors with distinct shapes
    SmallVector<TensorLayout> filter_layout{{{1, 2, 3, 4}, dtype::Float32()},
                                            {{5, 6, 7, 8}, dtype::Float32()}};

    EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _))
            .WillRepeatedly(Return(filter_layout));

    // both workspace queries are expected after the algorithm has been
    // chosen, and each exactly once
    Expectation algo_call =
            EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _))
                    .WillOnce(Return(&algo));
    Expectation ws_call = EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
                                  .After(algo_call)
                                  .WillOnce(Return(0));
    Expectation pre_ws_call =
            EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
                    .After(algo_call)
                    .WillOnce(Return(233));
    {
        ::testing::InSequence seq;

        // exec_preprocess should be called only once, with workspace allocated
        // `salt` provides a unique address used as the algorithm_id marker
        int salt = 0;
        EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _))
                .After(ws_call, pre_ws_call)
                .WillOnce(Invoke([&](const TensorLayout&, _megdnn_tensor_in,
                                     const TensorLayout&, PF* pf,
                                     _megdnn_workspace workspace) {
                    // 233 is the size returned for the preprocess workspace
                    ASSERT_EQ(workspace.size, 233);
                    ASSERT_NE(pf, nullptr);
                    pf->algorithm_id = &salt;
                    ASSERT_EQ(pf->tensors.size(), 2);
                    ASSERT_TRUE(pf->tensors[0].layout.eq_shape({1, 2, 3, 4}));
                    ASSERT_TRUE(pf->tensors[1].layout.eq_shape({5, 6, 7, 8}));
                    ASSERT_NE(pf->tensors[0].raw_ptr, nullptr);
                    ASSERT_NE(pf->tensors[1].raw_ptr, nullptr);
                    // marker values that exec verifies on every run
                    pf->tensors[0].ptr<float>()[0] = 114.514f;
                    pf->tensors[1].ptr<float>()[0] = 1926.0817f;
                }));

        // Run the graph multiple times.
        for (int i = 0; i < 3; i++) {
            if (i > 0) {
                EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _)).Times(0);
            }
            EXPECT_CALL(mock, exec(_, _, _, _, _))
                    .WillOnce(Invoke([&](_megdnn_tensor_in, _megdnn_tensor_in,
                                         _megdnn_tensor_out, const PF* pf,
                                         _megdnn_workspace) {
                        // the filter written by exec_preprocess must persist
                        ASSERT_NE(pf, nullptr);
                        ASSERT_EQ(pf->algorithm_id, &salt);
                        ASSERT_EQ(pf->tensors[0].ptr<float>()[0], 114.514f);
                        ASSERT_EQ(pf->tensors[1].ptr<float>()[0], 1926.0817f);
                    }));
            run();
        }
    }
 }

 }  // anonymous namespace

 // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}