@@ -194,6 +194,26 @@ R"__usage__(
     Execute operators with kernels implemented in MegDNN with CHWN4 tensor format. Can only be used
     on Nvidia GPUs, whose compute capability is above 6.1.
 )__usage__"
R"__usage__( | |||||
--enable-nchw44 | |||||
Execute operators with kernels implemented in MegDNN with NCHW44 tensor format. This can only | |||||
be used on arm of armv7 and arm64, support data tyep of float32, qint8 and int8x8x16. | |||||
)__usage__" | |||||
R"__usage__( | |||||
--enable-nhw88 | |||||
Execute operators with kernels implemented in MegDNN with NCHW88 tensor format. This can only | |||||
be used on x86 with data type float. | |||||
)__usage__" | |||||
R"__usage__( | |||||
--enable-nhw44-dot | |||||
Execute operators with kernels implemented in MegDNN with NCHW44-DOT tensor format. This Can | |||||
only be used on arm32 and arm64 with dot-product supported, and only support qint8 model | |||||
)__usage__" | |||||
R"__usage__( | |||||
--weight-preprocess | |||||
Execute operators with weight preprocess, which can optimize the operator execution time with | |||||
algo of winograd, im2col ,etc., but it may consume more memory. | |||||
)__usage__" | |||||
 ;
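For reference, the same optimization can also be enabled without the command line, which is exactly what the tests later in this patch do. A minimal sketch using only calls that appear elsewhere in this diff:

    // Enable weight preprocessing on a graph programmatically; this
    // trades extra memory for faster operator execution.
    auto graph = ComputingGraph::make();
    graph->options().graph_opt.weight_preprocess = true;
    // Equivalently, via the chained setter generated by the SET macro
    // (see the options header below):
    graph->options().graph_opt.enable_weight_preprocess();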
@@ -1226,6 +1246,11 @@ Args Args::from_argv(int argc, char **argv) {
             graph_opt.graph_opt.weight_winograd_transform = true;
             continue;
         }
+        if (!strcmp(argv[i], "--weight-preprocess")) {
+            mgb_log_warn("enable weight-preprocess optimization");
+            graph_opt.graph_opt.enable_weight_preprocess();
+            continue;
+        }
         fprintf(stderr, "invalid arg: %s\n", argv[i]);
         ret.args_parse_ret = -1;
@@ -97,6 +97,9 @@ struct GraphCommonOptimizeOptions {
     bool fuse_conv_bias_with_z = false;
     //! whether to enable fast-run profiled winograd opr replace
     bool weight_winograd_transform = false;
+    //! whether to enable weight preprocessing; this may use more memory,
+    //! and it is currently disabled by default
+    bool weight_preprocess = false;
     enum LayoutTransform : uint32_t {
         DEFAULT,
         NCHW4,  ///< compute using NCHW4 tensor format
@@ -127,6 +130,7 @@ struct GraphCommonOptimizeOptions {
         SET(fuse_conv_bias_nonlinearity);
         SET(fuse_conv_bias_with_z);
         SET(weight_winograd_transform);
+        SET(weight_preprocess);
 #undef SET
 #define SET(_trans, _trans_capital)                                 \
     GraphCommonOptimizeOptions& enable_##_trans() {                 \
@@ -963,6 +963,9 @@ void mixin::WeightPreprocessExecutor::record_preprocessed_weight(
 bool mixin::WeightPreprocessExecutor::mixin_allow_weight_preprocess(
         const cg::OperatorNodeBase& opr) const {
+    if (!opr.owner_graph()->options().graph_opt.weight_preprocess) {
+        return false;
+    }
     if (!opr.input(1)->contain_flag(VarNode::Flag::PERSISTENT_DEVICE_VALUE))
         return false;
     if (cg::is_const_var_value(opr.input(1)))
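With this change the function becomes an early-out chain: the graph-level option is consulted first, so graphs that never opt in pay no preprocessing cost at all. A condensed sketch of just the checks visible in this hunk (the function continues past the diff context, so the const-value branch is omitted):

    // The graph must opt in, and the weight input (input 1) must be a
    // value that stays resident on the device across executions.
    if (!opr.owner_graph()->options().graph_opt.weight_preprocess)
        return false;  // new guard: feature is off unless enabled
    if (!opr.input(1)->contain_flag(VarNode::Flag::PERSISTENT_DEVICE_VALUE))
        return false;  // non-persistent weights cannot be preprocessed once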
@@ -2225,6 +2225,7 @@ protected:
         iw = ih;
         comp_node = CompNode::load("cpux");
         graph = ComputingGraph::make();
+        graph->options().graph_opt.weight_preprocess = is_weight_preprocess();
         TensorShape x_shape{1, ic, ih, iw}, w_shape{oc, ic, fh, fh};
         x_host = std::make_shared<HostTensorND>(comp_node, x_shape);
         auto x = opr::Host2DeviceCopy::make(*graph, x_host);
@@ -2247,6 +2248,8 @@ protected:
     void run() { func->execute().wait(); }
 
+    virtual bool is_weight_preprocess() { return true; }
+
     void TearDown() override {
         func.reset();
         // Triggers mock check
@@ -2346,6 +2349,33 @@ TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) {
     }
 }
 
+class TestNoWeightPreprocess : public TestWeightPreprocess {
+    bool is_weight_preprocess() override { return false; }
+};
+
+TEST_F(TestNoWeightPreprocess, NoPreprocess) {
+    using ::testing::_;
+    using ::testing::Return;
+    auto& mock = mock_conv();
+    MockAlgorithm algo;
+    EXPECT_CALL(mock, get_algorithm_heuristic(_, _, _, _, _))
+            .WillRepeatedly(Return(&algo));
+    EXPECT_CALL(mock, get_workspace_in_bytes(_, _, _, _))
+            .WillRepeatedly(Return(0));
+    EXPECT_CALL(mock, get_preprocess_workspace_in_bytes(_, _, _))
+            .WillRepeatedly(Return(0));
+    {
+        ::testing::InSequence seq;
+        // With preprocessing disabled, the preprocess hooks must never run
+        EXPECT_CALL(mock, deduce_preprocessed_filter_layout(_, _, _)).Times(0);
+        EXPECT_CALL(mock, exec_preprocess(_, _, _, _, _)).Times(0);
+        EXPECT_CALL(mock, exec(_, _, _, nullptr, _));
+        run();
+    }
+}
+
 }  // anonymous namespace
 #endif