add a special opr_format modify function for Concat operators that adjusts the concat axis when the layout of the inputs has been changed
GitOrigin-RevId: 4094208057
release-1.7
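
When LayoutTransformPass rewrites a Concat whose inputs have been reformatted (e.g. NCHW -> NCHWc4), the concat axis can no longer be copied verbatim; it is re-derived by matching the named dimension of the original axis against the target format. The helper added in this patch (modify_concat_opr_format) does exactly that. The sketch below restates just the axis-remapping step for illustration; remap_concat_axis is a hypothetical free function, it omits includes, and it assumes the gopt internal helpers (tensor_formats_to_named_tensor_shape, megdnn::Dimension) used elsewhere in this diff:

    // Illustrative sketch only: re-derive a concat axis after a layout change.
    // Returns -1 when no valid axis exists in the target format.
    int remap_concat_axis(TensorFormats from, TensorFormats to, int axis) {
        using Dimension = megdnn::Dimension;
        auto orig = tensor_formats_to_named_tensor_shape(from);    // e.g. {N, C, H, W}
        auto target = tensor_formats_to_named_tensor_shape(to);    // e.g. {N, C//4, H, W, C%4}
        if (orig[axis].extent() != Dimension::UNDETERMINED_EXTENT)
            return -1;  // an axis with a fixed extent cannot be remapped
        auto name = orig[axis].name();
        for (size_t i = 0; i < target.ndim; ++i) {
            // pick the unfixed dimension that carries the same name, e.g. C -> C//4
            if (target[i].name() == name &&
                target[i].extent() == Dimension::UNDETERMINED_EXTENT)
                return static_cast<int>(i);
        }
        return -1;
    }

With base format NCHW and target NCHWc4, a concat along axis 1 (the C dimension) therefore stays on axis 1, which now indexes the C//4 dimension of the blocked layout. If no matching unfixed dimension exists, the real helper returns nullptr and the profiler treats that layout as unusable (PROFILE_TIME_OUT).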
@@ -831,7 +831,7 @@ typename ConvolutionBase<Parameter>::CanonizedFilterMeta ConvolutionBase<Paramet | |||
dst[4] = 32; | |||
} else if (param().format == Param::Format::NCHW88) { | |||
megdnn_assert( | |||
src.ndim == 5 || src.ndim == 4, | |||
src.ndim == 5 || (src.ndim == 4 && src[1] <= 8), | |||
"invalid src ndim for NCHW88, expected=5 or 4, got=%zu", src.ndim); | |||
dst.ndim = 5; | |||
dst[0] = src[0]; | |||
@@ -854,7 +854,7 @@ typename ConvolutionBase<Parameter>::CanonizedFilterMeta ConvolutionBase<Paramet | |||
param().format == Param::Format::NCHW44 || | |||
param().format == Param::Format::NCHW44_DOT) { | |||
megdnn_assert( | |||
src.ndim == 5 || src.ndim == 4, | |||
src.ndim == 5 || (src.ndim == 4 && src[1] <= 4), | |||
"invalid src ndim for NCHW44, expected=5 or 4, got=%zu", src.ndim); | |||
dst.ndim = 5; | |||
dst[0] = src[0]; | |||
@@ -840,7 +840,9 @@ const GraphOptimizer& GraphOptimizer::add_passes_for_graph_tuning_options( | |||
if (need_param_fuse) { | |||
add_pass<ParamFusePass>(); | |||
add_pass<ParamMergePass>(); | |||
} | |||
return *this; | |||
} | |||
@@ -66,7 +66,6 @@ std::unique_ptr<LayoutTransformContext> make_cuda_ctx( | |||
ctx->add_opr_config( | |||
opr::ConvBiasForward::typeinfo(), | |||
{OprFormatConfigID::NCHW, OprFormatConfigID::NHWC, | |||
OprFormatConfigID::NCHW4_NCHW32, OprFormatConfigID::NCHW32_NCHW4, | |||
OprFormatConfigID::NCHW4, OprFormatConfigID::NCHW32, | |||
OprFormatConfigID::NCHW64, OprFormatConfigID::CHWN4}) | |||
.add_opr_config( | |||
@@ -18,6 +18,7 @@ | |||
#include "megbrain/gopt/solver.h" | |||
#include "megbrain/opr/dnn/pooling.h" | |||
#include "megbrain/opr/imgproc.h" | |||
#include "megbrain/opr/tensor_manip.h" | |||
#include "megbrain/serialization/sereg.h" | |||
#include "megbrain/utils/hash_ct.h" | |||
@@ -64,11 +65,6 @@ void LayoutTransformPass::apply(OptState& opt) const { | |||
auto&& base_cfg_id = m_ctx->attribute().base_config_id; | |||
auto&& reformat_attribute = m_ctx->attribute().reformat_attribute; | |||
ThinHashMap<VarNode*, TensorFormats> var2fmts; | |||
static ThinHashSet<Typeinfo*> format_aware_oprs = { | |||
#define cb(_Opr) opr::_Opr::typeinfo(), | |||
FOREACH_FORMAT_AWARE_OPR(cb) | |||
#undef cb | |||
}; | |||
auto rewriter = opt.graph().make_rewriter(); | |||
auto on_opr = [&opr_configs, &base_fmt, &base_cfg_id, &reformat_attribute, | |||
&rewriter, &solution, &var2fmts, | |||
@@ -141,8 +137,12 @@ void LayoutTransformPass::apply(OptState& opt) const { | |||
new_inp[i] = new_var; | |||
} | |||
VarNode* new_out; | |||
if (format_aware_oprs.count(opr->dyn_typeinfo()) > 0) { | |||
new_out = intl::modify_opr_format(opr_fmt.val(), new_inp, opr); | |||
if (intl::has_opr_format_modifier(opr)) { | |||
intl::OprFormatInfo opr_format_info; | |||
opr_format_info.opr_format = opr_fmt.val(); | |||
opr_format_info.tensor_formats = { | |||
base_fmt, opr_format_to_tensor_formats(opr_fmt.val())}; | |||
new_out = intl::modify_opr_format(opr_format_info, new_inp, opr); | |||
} else { | |||
new_out = serialization::copy_opr_shallow(*opr, new_inp, opr->config()) | |||
->output(0); | |||
@@ -11,10 +11,12 @@ | |||
*/ | |||
#include "./opr_format_modifier.h" | |||
#include "./utils.h" | |||
#include "megbrain/opr/dnn/convolution.h" | |||
#include "megbrain/opr/dnn/pooling.h" | |||
#include "megbrain/opr/imgproc.h" | |||
#include "megbrain/opr/io.h" | |||
#include "megbrain/opr/tensor_manip.h" | |||
#include "megbrain/serialization/sereg.h" | |||
#include "midout.h" | |||
@@ -201,6 +203,37 @@ INST(ConvolutionBackwardData) | |||
INST(PoolingForward) | |||
#undef APPLY | |||
#undef INST | |||
VarNode* modify_concat_opr_format( | |||
gopt::intl::OprFormatInfo::TensorFormatsInfo tensor_formats, | |||
const VarNodeArray& i, const cg::OperatorNodeBase* opr) { | |||
auto base_format = tensor_formats.from; | |||
auto tensor_format = tensor_formats.to; | |||
int axis = opr->cast_final_safe<Concat>().axis(); | |||
/// modify axis | |||
using Dimension = megdnn::Dimension; | |||
static constexpr uint32_t UNDETERMINED_EXTENT = Dimension::UNDETERMINED_EXTENT; | |||
auto orig_shape = tensor_formats_to_named_tensor_shape(base_format); | |||
auto target_shape = tensor_formats_to_named_tensor_shape(tensor_format); | |||
mgb_assert( | |||
static_cast<size_t>(axis) < orig_shape.ndim, | |||
"invalid axis of concat opr(axis:%d,shp:%s)", axis, | |||
orig_shape.to_string().c_str()); | |||
if (orig_shape[axis].extent() != UNDETERMINED_EXTENT) | |||
return nullptr; | |||
auto axis_name = orig_shape[axis].name(); | |||
int new_axis = target_shape.ndim; | |||
for (size_t i = 0; i < target_shape.ndim; ++i) { | |||
if (target_shape[i].name() == axis_name && | |||
target_shape[i].extent() == UNDETERMINED_EXTENT) { | |||
new_axis = i; | |||
break; | |||
} | |||
} | |||
if (static_cast<size_t>(new_axis) >= target_shape.ndim) | |||
return nullptr; | |||
return opr::Concat::make(i, new_axis, opr->config()).node(); | |||
} | |||
} // namespace | |||
namespace mgb { | |||
@@ -275,13 +308,16 @@ INST(Resize, 2); | |||
#undef INST | |||
VarNode* modify_opr_format( | |||
opr::ConvBias::Param::Format opr_format, const VarNodeArray& i, | |||
OprFormatInfo opr_format_info, const VarNodeArray& i, | |||
const cg::OperatorNodeBase* opr) { | |||
#define cb(_Opr) \ | |||
if (opr->dyn_typeinfo() == _Opr::typeinfo()) { \ | |||
return OprFormatModifier<_Opr>::make(opr_format, i, opr); \ | |||
#define cb(_Opr) \ | |||
if (opr->dyn_typeinfo() == _Opr::typeinfo()) { \ | |||
return OprFormatModifier<_Opr>::make(opr_format_info.opr_format, i, opr); \ | |||
} else | |||
FOREACH_FORMAT_AWARE_OPR(cb) { | |||
FOREACH_FORMAT_AWARE_OPR(cb) | |||
if (opr->dyn_typeinfo() == opr::Concat::typeinfo()) { | |||
return modify_concat_opr_format(opr_format_info.tensor_formats, i, opr); | |||
} else { | |||
mgb_throw( | |||
InternalError, "invalid format aware operator(got:%s)", | |||
opr->dyn_typeinfo()->name); | |||
@@ -302,6 +338,28 @@ bool has_available_algo(const VarNodeArray& i, const cg::OperatorNodeBase* opr) | |||
InternalError, "invalid multi-algo operator(got:%s)", | |||
opr->dyn_typeinfo()->name); | |||
} | |||
#undef cb | |||
} | |||
bool has_opr_format(const cg::OperatorNodeBase* opr) { | |||
bool ret = false; | |||
#define cb(_Opr) ret |= opr->dyn_typeinfo() == _Opr::typeinfo(); | |||
FOREACH_FORMAT_AWARE_OPR(cb) | |||
#undef cb | |||
return ret; | |||
} | |||
bool has_opr_format_modifier(const cg::OperatorNodeBase* opr) { | |||
bool ret = false; | |||
#define cb(_Opr) ret |= opr->dyn_typeinfo() == _Opr::typeinfo(); | |||
FOREACH_MODIFY_OPR_FORMAT_OPR(cb) | |||
#undef cb | |||
return ret; | |||
} | |||
bool allow_aligned_layout(const cg::OperatorNodeBase* opr) { | |||
return opr->dyn_typeinfo() != opr::Concat::typeinfo() && | |||
opr->dyn_typeinfo() != opr::Reduce::typeinfo(); | |||
} | |||
} // namespace intl | |||
@@ -16,17 +16,36 @@ | |||
namespace mgb { | |||
namespace gopt { | |||
enum class TensorFormats : uint32_t; | |||
namespace intl { | |||
#define FOREACH_FORMAT_AWARE_OPR(cb) \ | |||
cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData) cb(PoolingForward) \ | |||
cb(WarpPerspective) cb(Resize) | |||
#define FOREACH_MODIFY_OPR_FORMAT_OPR(cb) FOREACH_FORMAT_AWARE_OPR(cb) cb(Concat) | |||
bool has_available_algo(const VarNodeArray& i, const cg::OperatorNodeBase* opr); | |||
struct OprFormatInfo { | |||
opr::Convolution::Param::Format opr_format; | |||
struct TensorFormatsInfo { | |||
TensorFormats from; | |||
TensorFormats to; | |||
}; | |||
TensorFormatsInfo tensor_formats; | |||
}; | |||
VarNode* modify_opr_format( | |||
opr::Convolution::Param::Format opr_format, const VarNodeArray& i, | |||
OprFormatInfo opr_format, const VarNodeArray& i, | |||
const cg::OperatorNodeBase* opr); | |||
bool has_opr_format(const cg::OperatorNodeBase* opr); | |||
bool has_opr_format_modifier(const cg::OperatorNodeBase* opr); | |||
bool allow_aligned_layout(const cg::OperatorNodeBase* opr); | |||
} // namespace intl | |||
} // namespace gopt | |||
} // namespace mgb | |||
@@ -625,7 +625,13 @@ struct ConvTensorFormatsDispatcherImpl<Opr, OprFormatConfigID::NCHW44_HYBRID> { | |||
TensorType tensor_type = i == 1 ? TensorType::WEIGHT : TensorType::FEATURE; | |||
config.input_tensor_types.emplace_back(tensor_type); | |||
} | |||
available &= check_dtype(opr->output(0)->dtype(), false); | |||
// FIXME: hack for nchw nchw44 hybrid mode | |||
static_assert( | |||
std::is_same<Opr, opr::ConvolutionForward>::value || | |||
std::is_same<Opr, opr::ConvBiasForward>::value, | |||
"nchw44 hybrid only support conv or conv_bias opr"); | |||
size_t in_channel = opr->input(0)->shape()[1]; | |||
available &= in_channel <= 4_z && check_dtype(opr->output(0)->dtype(), false); | |||
config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); | |||
available &= conv.param().sparse == Opr::Param::Sparse::DENSE; | |||
config.input_tensor_formats = { | |||
@@ -696,7 +702,14 @@ struct ConvTensorFormatsDispatcherImpl<Opr, OprFormatConfigID::NCHW88_HYBRID> { | |||
TensorType tensor_type = i == 1 ? TensorType::WEIGHT : TensorType::FEATURE; | |||
config.input_tensor_types.emplace_back(tensor_type); | |||
} | |||
available &= opr->output(0)->dtype().enumv() == DTypeEnum::Float32; | |||
// FIXME: hack for nchw nchw88 hybrid mode | |||
static_assert( | |||
std::is_same<Opr, opr::ConvolutionForward>::value || | |||
std::is_same<Opr, opr::ConvBiasForward>::value, | |||
"nchw nchw88 hybrid mode only support conv or conv_bias opr"); | |||
size_t in_channel = opr->input(0)->shape()[1]; | |||
available &= in_channel <= 8_z && | |||
opr->output(0)->dtype().enumv() == DTypeEnum::Float32; | |||
config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); | |||
available &= conv.param().sparse == Opr::Param::Sparse::DENSE; | |||
// setup tensor formats | |||
@@ -783,6 +796,13 @@ struct ConvTensorFormatsDispatcherImpl<Opr, OprFormatConfigID::NCHW44_DOT_HYBRID | |||
opr->output(0)->dtype().enumv() == DTypeEnum::Quantized8Asymm; | |||
config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); | |||
available &= conv.param().sparse == Opr::Param::Sparse::DENSE; | |||
// FIXME: hack for nchw nchw44 dot hybrid mode | |||
static_assert( | |||
std::is_same<Opr, opr::ConvolutionForward>::value || | |||
std::is_same<Opr, opr::ConvBiasForward>::value, | |||
"nchw44 dot hybrid only support conv or conv_bias opr"); | |||
size_t in_channel = opr->input(0)->shape()[1]; | |||
available &= in_channel <= 4_z; | |||
// setup tensor formats | |||
config.input_tensor_formats = { | |||
TensorFormats::NCHW, TensorFormats::KRSCk4, TensorFormats::NCHWc4, | |||
@@ -940,6 +960,8 @@ StaticData::StaticData() { | |||
OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW4); | |||
OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, CHWN4); | |||
OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW32); | |||
OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW32_NCHW4); | |||
OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW4_NCHW32); | |||
OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW64); | |||
OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW44); | |||
OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW88); | |||
@@ -19,6 +19,7 @@ | |||
#include "megbrain/opr/imgproc.h" | |||
#include "megbrain/opr/io.h" | |||
#include "megbrain/opr/nn_int.h" | |||
#include "megbrain/opr/tensor_manip.h" | |||
#include "megbrain/plugin/base.h" | |||
#include "megbrain/serialization/sereg.h" | |||
@@ -202,20 +203,43 @@ float ProfilerImpl::profile_operator( | |||
auto graph = ComputingGraph::make(); | |||
graph->options().graph_opt_level = 0; | |||
graph->options().var_sanity_check_first_run = false; | |||
OperatorNodeBase* new_opr; | |||
/// \note: Concat operators are treated specially, for two reasons:
/// 1. Padding the input varnodes of a Concat opr is not allowed. If the inputs
/// were padded, the padding information of the output varnode would have to be
/// propagated through the layout selection algorithm, and that feature has not
/// been implemented yet.
/// 2. The axis of the Concat operator has to be remapped, because the layouts of
/// the input varnodes have been changed. The new axis is therefore handled in
/// the OprMaker function.
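/// For example, with base format NCHW and target format NCHWc4
/// ({N, C//4, H, W, C%4}), a concat along axis 1 (the C dimension) is remapped
/// to axis 1 of the blocked layout (the C//4 dimension); see
/// modify_concat_opr_format in opr_format_modifier.cpp.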
bool allow_aligned = intl::allow_aligned_layout(opr); | |||
VarNodeArray new_inps(opr->input().size()); | |||
for (size_t i = 0; i < opr->input().size(); ++i) { | |||
auto&& var = opr->input(i); | |||
auto&& cn = var->comp_node(); | |||
auto&& dtype = var->dtype(); | |||
auto dval = std::make_shared<DeviceTensorND>(cn, dtype); | |||
auto aligned_tensor_shape = ReformatManager::make_aligned_tensor_shape( | |||
var, base_format, tensor_format, extra_attribute); | |||
dval->resize(aligned_tensor_shape); | |||
auto aligned_var = opr::VolatileSharedDeviceTensor::make(*graph, dval); | |||
new_inps[i] = aligned_var.node(); | |||
auto new_shape = ReformatManager::try_make_tensor_shape( | |||
var, base_format, tensor_format, extra_attribute, allow_aligned); | |||
if (new_shape.ndim == 0) | |||
return PROFILE_TIME_OUT; | |||
dval->resize(new_shape); | |||
auto new_var = opr::VolatileSharedDeviceTensor::make(*graph, dval); | |||
new_inps[i] = new_var.node(); | |||
} | |||
if (intl::has_opr_format_modifier(opr)) { | |||
intl::OprFormatInfo opr_format_info; | |||
opr_format_info.tensor_formats = {base_format, tensor_format}; | |||
auto new_var = intl::modify_opr_format(opr_format_info, new_inps, opr); | |||
if (new_var) | |||
new_opr = new_var->owner_opr(); | |||
else | |||
return PROFILE_TIME_OUT; | |||
} else { | |||
new_opr = serialization::copy_opr_shallow( | |||
*opr, new_inps, opr->config(), {graph.get()}); | |||
} | |||
auto new_opr = serialization::copy_opr_shallow( | |||
*opr, new_inps, opr->config(), {graph.get()}); | |||
if (!m_opr_filter(opr, new_opr)) | |||
return PROFILE_TIME_OUT; | |||
auto y = new_opr->output(0); | |||
@@ -248,6 +272,8 @@ float ProfilerImpl::profile_operator( | |||
ReformatAttribute extra_attribute) const { | |||
auto graph = ComputingGraph::make(); | |||
graph->options().graph_opt_level = 0; | |||
graph->options().graph_opt.weight_preprocess = | |||
opr->owner_graph()->options().graph_opt.weight_preprocess; | |||
graph->options().var_sanity_check_first_run = false; | |||
VarNodeArray new_inps(opr->input().size()); | |||
size_t i = 0; | |||
@@ -274,8 +300,11 @@ float ProfilerImpl::profile_operator( | |||
config.input_tensor_formats[i], extra_attribute); | |||
} | |||
dval->resize(aligned_shape); | |||
auto aligned_var = opr::VolatileSharedDeviceTensor::make(*graph, dval); | |||
new_inps[i] = aligned_var.node(); | |||
if (config.input_tensor_types[i] == TensorType::WEIGHT) { | |||
new_inps[i] = opr::SharedDeviceTensor::make_const(*graph, dval).node(); | |||
} else { | |||
new_inps[i] = opr::VolatileSharedDeviceTensor::make(*graph, dval).node(); | |||
} | |||
} | |||
for (; i < opr->input().size(); ++i) { | |||
auto&& var = opr->input(i); | |||
@@ -291,7 +320,9 @@ float ProfilerImpl::profile_operator( | |||
auto imm = opr::ImmutableTensor::make(*graph, *hval); | |||
new_inps[i] = imm.node(); | |||
} | |||
VarNode* y = mgb::gopt::intl::modify_opr_format(config.opr_format, new_inps, opr); | |||
intl::OprFormatInfo opr_format_info; | |||
opr_format_info.opr_format = config.opr_format; | |||
VarNode* y = mgb::gopt::intl::modify_opr_format(opr_format_info, new_inps, opr); | |||
static const ThinHashSet<Typeinfo*> multi_algo_oprs = { | |||
opr::Convolution::typeinfo(), | |||
opr::ConvBiasForward::typeinfo(), | |||
@@ -587,9 +587,9 @@ const ReformatManager& ReformatManager::instance() { | |||
return inst; | |||
} | |||
TensorShape ReformatManager::make_aligned_tensor_shape( | |||
TensorShape ReformatManager::try_make_tensor_shape( | |||
const VarNode* var, TensorFormats orig_formats, TensorFormats target_formats, | |||
ReformatKey::Attribute extra_attribute) { | |||
ReformatKey::Attribute extra_attribute, bool allow_aligned) { | |||
using Dimension = megdnn::Dimension; | |||
static constexpr uint32_t UNDETERMINED_EXTENT = Dimension::UNDETERMINED_EXTENT; | |||
auto orig_shape = tensor_formats_to_named_tensor_shape(orig_formats); | |||
@@ -623,8 +623,17 @@ TensorShape ReformatManager::make_aligned_tensor_shape( | |||
: (orig_shape[idx] / target_shape[i]).extent(); | |||
if (mul) | |||
tshp[i] = oshp[idx] * factor; | |||
else | |||
tshp[i] = divup(oshp[idx], factor); | |||
else { | |||
if (allow_aligned) | |||
tshp[i] = divup(oshp[idx], factor); | |||
else if (!(oshp[idx] % factor)) { | |||
tshp[i] = oshp[idx] / factor; | |||
} else { | |||
return TensorShape{}; | |||
} | |||
} | |||
/// hack for nhwc auto padding | |||
if (name == Dimension::Name::C) { | |||
size_t channel_alignment = target_shape[i].stride(); | |||
size_t channels = tshp[i] * channel_alignment; | |||
@@ -641,6 +650,15 @@ TensorShape ReformatManager::make_aligned_tensor_shape( | |||
return tshp; | |||
} | |||
TensorShape ReformatManager::make_aligned_tensor_shape( | |||
const VarNode* var, TensorFormats orig_formats, TensorFormats target_formats, | |||
ReformatKey::Attribute extra_attribute) { | |||
auto tshp = ReformatManager::try_make_tensor_shape( | |||
var, orig_formats, target_formats, extra_attribute); | |||
mgb_assert(tshp.ndim); | |||
return tshp; | |||
} | |||
TensorShape ReformatManager::make_aligned_weight_shape( | |||
const VarNode* var, TensorFormats orig_formats, TensorFormats target_formats, | |||
TensorFormats extra_formats, ReformatKey::Attribute extra_attribute) { | |||
@@ -93,6 +93,8 @@ static inline TensorFormats opr_format_to_tensor_formats( | |||
return TensorFormats::NCHWc4; | |||
case OprFormat::NCHW8: | |||
return TensorFormats::NCHWc8; | |||
case OprFormat::NCHW44_DOT: | |||
return TensorFormats::NCHWc4; | |||
default: | |||
mgb_throw( | |||
AssertionError, "format(%s) is not supported", | |||
@@ -171,7 +173,6 @@ static inline megdnn::NamedTensorShape tensor_formats_to_named_tensor_shape( | |||
static_cast<uint32_t>(format)); | |||
} | |||
} | |||
} // namespace gopt | |||
} // namespace mgb | |||
@@ -135,6 +135,13 @@ public: | |||
const VarNode* orig_var, const ReformatKey& key, | |||
const AlignmentDesc& extra_alignment = {}) const; | |||
/// returns an empty shape if the shape of the original varnode does not satisfy
/// the alignment requirement of the target tensor format
static TensorShape try_make_tensor_shape( | |||
const VarNode* var, TensorFormats orig_formats, | |||
TensorFormats target_formats, | |||
ReformatKey::Attribute extra_attribute = ReformatKey::Attribute::DEFAULT, | |||
bool allow_aligned = true); | |||
static TensorShape make_aligned_tensor_shape( | |||
const VarNode* var, TensorFormats orig_formats, | |||
TensorFormats target_formats, | |||
@@ -18104,3 +18104,217 @@ static const std::vector<uint8_t> TestLayoutTransform_Wide = { | |||
108, 111, 97, 116, 51, 50, 124, 49, 54, 44, 49, 54, 44, 56, 44, 56, | |||
59, 70, 108, 111, 97, 116, 51, 50, 124, 49, 48, 4, 0, 0, 0, 233, | |||
38, 177, 64}; | |||
static const std::vector<uint8_t> TestLayoutTransform_Concat = { | |||
1, 0, 0, 0, 34, 0, 0, 0, 108, 97, 121, 111, 117, 116, 95, 116, | |||
114, 97, 110, 115, 102, 111, 114, 109, 95, 112, 114, 111, 102, 105, 108, 101, | |||
58, 112, 108, 97, 116, 61, 99, 117, 100, 97, 33, 0, 0, 0, 77, 0, | |||
0, 0, 49, 54, 44, 50, 44, 51, 54, 56, 44, 54, 52, 48, 59, 81, | |||
117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 124, 123, 78, 44, 67, 47, | |||
47, 52, 44, 72, 44, 87, 44, 67, 37, 52, 125, 59, 123, 78, 44, 67, | |||
44, 72, 44, 87, 125, 59, 48, 59, 81, 117, 97, 110, 116, 105, 122, 101, | |||
100, 83, 56, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 4, | |||
0, 0, 0, 28, 90, 20, 67, 89, 0, 0, 0, 47, 225, 79, 220, 129, | |||
196, 236, 15, 1, 0, 0, 0, 49, 54, 44, 50, 44, 51, 54, 56, 44, | |||
54, 52, 48, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 124, | |||
49, 54, 44, 50, 44, 51, 54, 56, 44, 54, 52, 48, 59, 81, 117, 97, | |||
110, 116, 105, 122, 101, 100, 83, 56, 124, 49, 54, 44, 52, 44, 51, 54, | |||
56, 44, 54, 52, 48, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, | |||
56, 124, 51, 48, 4, 0, 0, 0, 128, 150, 24, 75, 77, 0, 0, 0, | |||
49, 54, 44, 52, 44, 49, 56, 52, 44, 51, 50, 48, 59, 81, 117, 97, | |||
110, 116, 105, 122, 101, 100, 83, 56, 124, 123, 78, 44, 67, 47, 47, 52, | |||
44, 72, 44, 87, 44, 67, 37, 52, 125, 59, 123, 78, 44, 67, 44, 72, | |||
44, 87, 125, 59, 48, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, | |||
56, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 4, 0, 0, | |||
0, 236, 81, 208, 66, 52, 0, 0, 0, 145, 22, 69, 252, 191, 174, 0, | |||
1, 49, 54, 44, 51, 44, 55, 51, 54, 44, 49, 50, 56, 48, 59, 85, | |||
105, 110, 116, 56, 124, 49, 54, 44, 51, 44, 55, 51, 54, 44, 49, 50, | |||
56, 48, 59, 70, 108, 111, 97, 116, 51, 50, 124, 48, 48, 4, 0, 0, | |||
0, 88, 57, 79, 68, 66, 0, 0, 0, 49, 54, 44, 51, 44, 55, 51, | |||
54, 44, 49, 50, 56, 48, 59, 70, 108, 111, 97, 116, 51, 50, 124, 123, | |||
78, 44, 67, 44, 72, 44, 87, 125, 59, 123, 78, 44, 67, 47, 47, 52, | |||
44, 72, 44, 87, 44, 67, 37, 52, 125, 59, 48, 59, 70, 108, 111, 97, | |||
116, 51, 50, 59, 70, 108, 111, 97, 116, 51, 50, 4, 0, 0, 0, 134, | |||
75, 130, 69, 163, 0, 0, 0, 10, 151, 159, 190, 73, 14, 145, 194, 1, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, | |||
0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 1, | |||
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, | |||
0, 0, 0, 255, 255, 255, 255, 255, 255, 255, 255, 49, 54, 44, 56, 44, | |||
49, 56, 52, 44, 51, 50, 48, 59, 81, 117, 97, 110, 116, 105, 122, 101, | |||
100, 83, 56, 124, 49, 54, 44, 56, 44, 51, 44, 51, 59, 81, 117, 97, | |||
110, 116, 105, 122, 101, 100, 83, 56, 124, 49, 44, 49, 54, 44, 49, 44, | |||
49, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 51, 50, 124, 49, | |||
54, 44, 49, 54, 44, 57, 50, 44, 49, 54, 48, 59, 81, 117, 97, 110, | |||
116, 105, 122, 101, 100, 83, 56, 124, 51, 48, 4, 0, 0, 0, 68, 139, | |||
104, 67, 60, 0, 0, 0, 49, 54, 44, 51, 44, 55, 51, 54, 44, 49, | |||
50, 56, 48, 59, 85, 105, 110, 116, 56, 124, 123, 78, 44, 67, 47, 47, | |||
52, 44, 72, 44, 87, 44, 67, 37, 52, 125, 59, 123, 78, 44, 67, 44, | |||
72, 44, 87, 125, 59, 48, 59, 85, 105, 110, 116, 56, 59, 85, 105, 110, | |||
116, 56, 4, 0, 0, 0, 201, 161, 135, 68, 64, 0, 0, 0, 218, 182, | |||
120, 146, 221, 155, 179, 235, 11, 0, 0, 0, 49, 54, 44, 52, 44, 49, | |||
56, 52, 44, 51, 50, 48, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, | |||
83, 56, 124, 49, 54, 44, 52, 44, 49, 56, 52, 44, 51, 50, 48, 59, | |||
81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 124, 48, 48, 4, 0, | |||
0, 0, 182, 243, 29, 66, 161, 0, 0, 0, 10, 151, 159, 190, 73, 14, | |||
145, 194, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 2, 0, | |||
0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, | |||
0, 0, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255, 255, 255, 49, 54, | |||
44, 52, 44, 51, 54, 56, 44, 54, 52, 48, 59, 81, 117, 97, 110, 116, | |||
105, 122, 101, 100, 83, 56, 124, 52, 44, 52, 44, 51, 44, 51, 59, 81, | |||
117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 124, 49, 44, 52, 44, 49, | |||
44, 49, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 51, 50, 124, | |||
49, 54, 44, 52, 44, 49, 56, 52, 44, 51, 50, 48, 59, 81, 117, 97, | |||
110, 116, 105, 122, 101, 100, 83, 56, 124, 48, 48, 4, 0, 0, 0, 128, | |||
150, 24, 75, 60, 0, 0, 0, 49, 54, 44, 51, 44, 55, 51, 54, 44, | |||
49, 50, 56, 48, 59, 85, 105, 110, 116, 56, 124, 123, 78, 44, 67, 44, | |||
72, 44, 87, 125, 59, 123, 78, 44, 67, 47, 47, 52, 44, 72, 44, 87, | |||
44, 67, 37, 52, 125, 59, 48, 59, 85, 105, 110, 116, 56, 59, 85, 105, | |||
110, 116, 56, 4, 0, 0, 0, 203, 33, 219, 68, 68, 0, 0, 0, 218, | |||
182, 120, 146, 221, 155, 179, 235, 16, 0, 0, 0, 49, 54, 44, 51, 44, | |||
55, 51, 54, 44, 49, 50, 56, 48, 59, 70, 108, 111, 97, 116, 51, 50, | |||
124, 49, 59, 70, 108, 111, 97, 116, 51, 50, 124, 49, 54, 44, 51, 44, | |||
55, 51, 54, 44, 49, 50, 56, 48, 59, 70, 108, 111, 97, 116, 51, 50, | |||
124, 48, 48, 4, 0, 0, 0, 205, 204, 161, 68, 77, 0, 0, 0, 49, | |||
54, 44, 52, 44, 51, 54, 56, 44, 54, 52, 48, 59, 81, 117, 97, 110, | |||
116, 105, 122, 101, 100, 83, 56, 124, 123, 78, 44, 67, 47, 47, 52, 44, | |||
72, 44, 87, 44, 67, 37, 52, 125, 59, 123, 78, 44, 67, 44, 72, 44, | |||
87, 125, 59, 48, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 56, | |||
59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 4, 0, 0, 0, | |||
200, 118, 174, 67, 163, 0, 0, 0, 10, 151, 159, 190, 73, 14, 145, 194, | |||
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
1, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, | |||
1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, | |||
0, 0, 0, 0, 255, 255, 255, 255, 255, 255, 255, 255, 49, 54, 44, 56, | |||
44, 49, 56, 52, 44, 51, 50, 48, 59, 81, 117, 97, 110, 116, 105, 122, | |||
101, 100, 83, 56, 124, 49, 54, 44, 56, 44, 51, 44, 51, 59, 81, 117, | |||
97, 110, 116, 105, 122, 101, 100, 83, 56, 124, 49, 44, 49, 54, 44, 49, | |||
44, 49, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 51, 50, 124, | |||
49, 54, 44, 49, 54, 44, 57, 50, 44, 49, 54, 48, 59, 81, 117, 97, | |||
110, 116, 105, 122, 101, 100, 83, 56, 124, 48, 48, 4, 0, 0, 0, 128, | |||
150, 24, 75, 161, 0, 0, 0, 10, 151, 159, 190, 73, 14, 145, 194, 1, | |||
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, | |||
0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, 1, | |||
0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, | |||
0, 0, 0, 255, 255, 255, 255, 255, 255, 255, 255, 49, 54, 44, 52, 44, | |||
51, 54, 56, 44, 54, 52, 48, 59, 81, 117, 97, 110, 116, 105, 122, 101, | |||
100, 83, 56, 124, 52, 44, 52, 44, 51, 44, 51, 59, 81, 117, 97, 110, | |||
116, 105, 122, 101, 100, 83, 56, 124, 49, 44, 52, 44, 49, 44, 49, 59, | |||
81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 51, 50, 124, 49, 54, 44, | |||
52, 44, 49, 56, 52, 44, 51, 50, 48, 59, 81, 117, 97, 110, 116, 105, | |||
122, 101, 100, 83, 56, 124, 51, 48, 4, 0, 0, 0, 138, 65, 164, 67, | |||
77, 0, 0, 0, 49, 54, 44, 50, 44, 51, 54, 56, 44, 54, 52, 48, | |||
59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 124, 123, 78, 44, | |||
67, 44, 72, 44, 87, 125, 59, 123, 78, 44, 67, 47, 47, 52, 44, 72, | |||
44, 87, 44, 67, 37, 52, 125, 59, 48, 59, 81, 117, 97, 110, 116, 105, | |||
122, 101, 100, 83, 56, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, | |||
56, 4, 0, 0, 0, 137, 65, 190, 67, 58, 0, 0, 0, 145, 22, 69, | |||
252, 191, 174, 0, 1, 49, 54, 44, 51, 44, 55, 51, 54, 44, 49, 50, | |||
56, 48, 59, 70, 108, 111, 97, 116, 51, 50, 124, 49, 54, 44, 51, 44, | |||
55, 51, 54, 44, 49, 50, 56, 48, 59, 81, 117, 97, 110, 116, 105, 122, | |||
101, 100, 83, 56, 124, 48, 48, 4, 0, 0, 0, 38, 177, 146, 68, 89, | |||
0, 0, 0, 47, 225, 79, 220, 129, 196, 236, 15, 1, 0, 0, 0, 49, | |||
54, 44, 50, 44, 51, 54, 56, 44, 54, 52, 48, 59, 81, 117, 97, 110, | |||
116, 105, 122, 101, 100, 83, 56, 124, 49, 54, 44, 50, 44, 51, 54, 56, | |||
44, 54, 52, 48, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 56, | |||
124, 49, 54, 44, 52, 44, 51, 54, 56, 44, 54, 52, 48, 59, 81, 117, | |||
97, 110, 116, 105, 122, 101, 100, 83, 56, 124, 48, 48, 4, 0, 0, 0, | |||
150, 67, 235, 66, 162, 0, 0, 0, 10, 151, 159, 190, 73, 14, 145, 194, | |||
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
1, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 2, 0, 0, 0, | |||
1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, | |||
0, 0, 0, 0, 255, 255, 255, 255, 255, 255, 255, 255, 49, 54, 44, 51, | |||
44, 55, 51, 54, 44, 49, 50, 56, 48, 59, 81, 117, 97, 110, 116, 105, | |||
122, 101, 100, 83, 56, 124, 50, 44, 51, 44, 51, 44, 51, 59, 81, 117, | |||
97, 110, 116, 105, 122, 101, 100, 83, 56, 124, 49, 44, 50, 44, 49, 44, | |||
49, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 51, 50, 124, 49, | |||
54, 44, 50, 44, 51, 54, 56, 44, 54, 52, 48, 59, 81, 117, 97, 110, | |||
116, 105, 122, 101, 100, 83, 56, 124, 51, 48, 4, 0, 0, 0, 128, 150, | |||
24, 75, 64, 0, 0, 0, 218, 182, 120, 146, 221, 155, 179, 235, 11, 0, | |||
0, 0, 49, 54, 44, 50, 44, 51, 54, 56, 44, 54, 52, 48, 59, 81, | |||
117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 124, 49, 54, 44, 50, 44, | |||
51, 54, 56, 44, 54, 52, 48, 59, 81, 117, 97, 110, 116, 105, 122, 101, | |||
100, 83, 56, 124, 51, 48, 4, 0, 0, 0, 227, 165, 243, 66, 77, 0, | |||
0, 0, 49, 54, 44, 52, 44, 49, 56, 52, 44, 51, 50, 48, 59, 81, | |||
117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 124, 123, 78, 44, 67, 44, | |||
72, 44, 87, 125, 59, 123, 78, 44, 67, 47, 47, 52, 44, 72, 44, 87, | |||
44, 67, 37, 52, 125, 59, 48, 59, 81, 117, 97, 110, 116, 105, 122, 101, | |||
100, 83, 56, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 4, | |||
0, 0, 0, 174, 71, 169, 66, 162, 0, 0, 0, 10, 151, 159, 190, 73, | |||
14, 145, 194, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, | |||
0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 2, | |||
0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2, | |||
0, 0, 0, 0, 0, 0, 0, 255, 255, 255, 255, 255, 255, 255, 255, 49, | |||
54, 44, 51, 44, 55, 51, 54, 44, 49, 50, 56, 48, 59, 81, 117, 97, | |||
110, 116, 105, 122, 101, 100, 83, 56, 124, 50, 44, 51, 44, 51, 44, 51, | |||
59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 124, 49, 44, 50, | |||
44, 49, 44, 49, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 51, | |||
50, 124, 49, 54, 44, 50, 44, 51, 54, 56, 44, 54, 52, 48, 59, 81, | |||
117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 124, 48, 48, 4, 0, 0, | |||
0, 128, 150, 24, 75, 66, 0, 0, 0, 49, 54, 44, 51, 44, 55, 51, | |||
54, 44, 49, 50, 56, 48, 59, 70, 108, 111, 97, 116, 51, 50, 124, 123, | |||
78, 44, 67, 47, 47, 52, 44, 72, 44, 87, 44, 67, 37, 52, 125, 59, | |||
123, 78, 44, 67, 44, 72, 44, 87, 125, 59, 48, 59, 70, 108, 111, 97, | |||
116, 51, 50, 59, 70, 108, 111, 97, 116, 51, 50, 4, 0, 0, 0, 196, | |||
32, 63, 69, 89, 0, 0, 0, 47, 225, 79, 220, 129, 196, 236, 15, 1, | |||
0, 0, 0, 49, 54, 44, 52, 44, 49, 56, 52, 44, 51, 50, 48, 59, | |||
81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 124, 49, 54, 44, 52, | |||
44, 49, 56, 52, 44, 51, 50, 48, 59, 81, 117, 97, 110, 116, 105, 122, | |||
101, 100, 83, 56, 124, 49, 54, 44, 56, 44, 49, 56, 52, 44, 51, 50, | |||
48, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 124, 48, 48, | |||
4, 0, 0, 0, 156, 196, 136, 66, 77, 0, 0, 0, 49, 54, 44, 49, | |||
54, 44, 57, 50, 44, 49, 54, 48, 59, 81, 117, 97, 110, 116, 105, 122, | |||
101, 100, 83, 56, 124, 123, 78, 44, 67, 47, 47, 52, 44, 72, 44, 87, | |||
44, 67, 37, 52, 125, 59, 123, 78, 44, 67, 44, 72, 44, 87, 125, 59, | |||
48, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 59, 81, 117, | |||
97, 110, 116, 105, 122, 101, 100, 83, 56, 4, 0, 0, 0, 207, 247, 211, | |||
66, 77, 0, 0, 0, 49, 54, 44, 56, 44, 49, 56, 52, 44, 51, 50, | |||
48, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 124, 123, 78, | |||
44, 67, 47, 47, 52, 44, 72, 44, 87, 44, 67, 37, 52, 125, 59, 123, | |||
78, 44, 67, 44, 72, 44, 87, 125, 59, 48, 59, 81, 117, 97, 110, 116, | |||
105, 122, 101, 100, 83, 56, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, | |||
83, 56, 4, 0, 0, 0, 54, 94, 58, 67, 64, 0, 0, 0, 218, 182, | |||
120, 146, 221, 155, 179, 235, 11, 0, 0, 0, 49, 54, 44, 50, 44, 51, | |||
54, 56, 44, 54, 52, 48, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, | |||
83, 56, 124, 49, 54, 44, 50, 44, 51, 54, 56, 44, 54, 52, 48, 59, | |||
81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 124, 48, 48, 4, 0, | |||
0, 0, 155, 196, 128, 66, 78, 0, 0, 0, 49, 54, 44, 51, 44, 55, | |||
51, 54, 44, 49, 50, 56, 48, 59, 81, 117, 97, 110, 116, 105, 122, 101, | |||
100, 83, 56, 124, 123, 78, 44, 67, 44, 72, 44, 87, 125, 59, 123, 78, | |||
44, 67, 47, 47, 52, 44, 72, 44, 87, 44, 67, 37, 52, 125, 59, 48, | |||
59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 59, 81, 117, 97, | |||
110, 116, 105, 122, 101, 100, 83, 56, 4, 0, 0, 0, 191, 74, 159, 68, | |||
89, 0, 0, 0, 47, 225, 79, 220, 129, 196, 236, 15, 1, 0, 0, 0, | |||
49, 54, 44, 52, 44, 49, 56, 52, 44, 51, 50, 48, 59, 81, 117, 97, | |||
110, 116, 105, 122, 101, 100, 83, 56, 124, 49, 54, 44, 52, 44, 49, 56, | |||
52, 44, 51, 50, 48, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, | |||
56, 124, 49, 54, 44, 56, 44, 49, 56, 52, 44, 51, 50, 48, 59, 81, | |||
117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 124, 51, 48, 4, 0, 0, | |||
0, 170, 241, 138, 66, 77, 0, 0, 0, 49, 54, 44, 52, 44, 51, 54, | |||
56, 44, 54, 52, 48, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, | |||
56, 124, 123, 78, 44, 67, 44, 72, 44, 87, 125, 59, 123, 78, 44, 67, | |||
47, 47, 52, 44, 72, 44, 87, 44, 67, 37, 52, 125, 59, 48, 59, 81, | |||
117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 59, 81, 117, 97, 110, 116, | |||
105, 122, 101, 100, 83, 56, 4, 0, 0, 0, 79, 141, 147, 67, 77, 0, | |||
0, 0, 49, 54, 44, 49, 54, 44, 57, 50, 44, 49, 54, 48, 59, 81, | |||
117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 124, 123, 78, 44, 67, 44, | |||
72, 44, 87, 125, 59, 123, 78, 44, 67, 47, 47, 52, 44, 72, 44, 87, | |||
44, 67, 37, 52, 125, 59, 48, 59, 81, 117, 97, 110, 116, 105, 122, 101, | |||
100, 83, 56, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 4, | |||
0, 0, 0, 170, 241, 170, 66, 77, 0, 0, 0, 49, 54, 44, 56, 44, | |||
49, 56, 52, 44, 51, 50, 48, 59, 81, 117, 97, 110, 116, 105, 122, 101, | |||
100, 83, 56, 124, 123, 78, 44, 67, 44, 72, 44, 87, 125, 59, 123, 78, | |||
44, 67, 47, 47, 52, 44, 72, 44, 87, 44, 67, 37, 52, 125, 59, 48, | |||
59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 59, 81, 117, 97, | |||
110, 116, 105, 122, 101, 100, 83, 56, 4, 0, 0, 0, 203, 161, 25, 67, | |||
64, 0, 0, 0, 218, 182, 120, 146, 221, 155, 179, 235, 11, 0, 0, 0, | |||
49, 54, 44, 52, 44, 49, 56, 52, 44, 51, 50, 48, 59, 81, 117, 97, | |||
110, 116, 105, 122, 101, 100, 83, 56, 124, 49, 54, 44, 52, 44, 49, 56, | |||
52, 44, 51, 50, 48, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, | |||
56, 124, 51, 48, 4, 0, 0, 0, 102, 102, 22, 66, 78, 0, 0, 0, | |||
49, 54, 44, 51, 44, 55, 51, 54, 44, 49, 50, 56, 48, 59, 81, 117, | |||
97, 110, 116, 105, 122, 101, 100, 83, 56, 124, 123, 78, 44, 67, 47, 47, | |||
52, 44, 72, 44, 87, 44, 67, 37, 52, 125, 59, 123, 78, 44, 67, 44, | |||
72, 44, 87, 125, 59, 48, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, | |||
83, 56, 59, 81, 117, 97, 110, 116, 105, 122, 101, 100, 83, 56, 4, 0, | |||
0, 0, 54, 94, 72, 68}; |
@@ -20,7 +20,7 @@ | |||
# 2. Build megbrain_test and run all tests related to global graph optimization:
# ./megbrain_test --gtest_filter="*LayoutTransform*"
# 3. Use this script to pack all the cache files together:
# python3 embed_cache.py -o cache_data.h -r $(ls /path/to/cache/*.cache)
# python3 embed_cache.py -o cache_data.h -r -a $(ls /path/to/cache/*.cache)
# 4. Change the define statement from step 1 back, so that the profiling step uses the cached data.
# 5. Finally, rebuild megbrain_test to make sure the test results are still correct.
import os.path | |||
@@ -44,9 +44,10 @@ def _u32(data): | |||
class CacheDataGenerator: | |||
_cache_files = None | |||
def __init__(self, cache_files, remove_plat_info = True): | |||
def __init__(self, cache_files, remove_plat_info=True, append_cache=True): | |||
self._cache_files = cache_files | |||
self._remove_plat_info = remove_plat_info | |||
self._append_cache = append_cache | |||
def _get_hash(self): | |||
return _u32(self._hash.digest()[:4]) | |||
@@ -71,9 +72,10 @@ class CacheDataGenerator: | |||
return ','.join(ret) | |||
def gen_cache_data_header(self, fout, src_map): | |||
fout.write('// generated embed_cache.py\n') | |||
fout.write('#include <vector>\n') | |||
fout.write('#include <stdint.h>\n') | |||
if not self._append_cache: | |||
fout.write('// generated embed_cache.py\n') | |||
fout.write('#include <vector>\n') | |||
fout.write('#include <stdint.h>\n') | |||
for k, v in sorted(src_map.items()): | |||
fout.write(""" | |||
static const std::vector<uint8_t> {} = {{ | |||
@@ -89,7 +91,11 @@ static const std::vector<uint8_t> {} = {{ | |||
assert ext == ".cache", "ext: {}, fname {}".format(ext, fname) | |||
assert base not in fname2cache_data, "duplicated kernel: " + base | |||
fname2cache_data[base] = self.gen_cache_data(fname) | |||
with open(output, 'w') as fout: | |||
if self._append_cache: | |||
mode = 'a' | |||
else: | |||
mode = 'w' | |||
with open(output, mode) as fout: | |||
self.gen_cache_data_header(fout, fname2cache_data) | |||
logger.info('done') | |||
@@ -107,7 +113,15 @@ if __name__ == '__main__': | |||
default=True, | |||
help="whether remove platform infomation in the cache (default: True)" | |||
) | |||
parser.add_argument( | |||
"-a", | |||
"--append-cache", | |||
action='store_true', | |||
default=True, | |||
help="whether append the cache (default: True)" | |||
) | |||
parser.add_argument('cache', help='cache files to be embedded', nargs='+') | |||
args = parser.parse_args() | |||
cache_generator = CacheDataGenerator(args.cache, args.remove_plat_info) | |||
cache_generator = CacheDataGenerator(args.cache, args.remove_plat_info, | |||
args.append_cache) | |||
cache_generator.invoke(args.output) |
@@ -812,7 +812,9 @@ TEST(TestLayoutTransform, Resnet18_F16) { | |||
.add_pass<LayoutTransformPass>(std::move(ctx), std::move(solver)) | |||
.add_pass<ShuffleShuffleRemovePass>() | |||
.add_pass(FuseNCHW4Int8Preprocess::make()) | |||
#if CUDA_VERSION >= 10020 | |||
.add_pass<FoldingConvBiasDimshufflePass>() | |||
#endif | |||
.add_pass<ParamFusePass>() | |||
.add_pass<ParamMergePass>() | |||
.apply({{output}}) | |||
@@ -1205,4 +1207,112 @@ TEST(TestLayoutTransform, MobileNetV2_NCHW44_DOT) { | |||
MGB_ASSERT_TENSOR_EQ(t1, t2); | |||
} | |||
#if MGB_CUDA | |||
TEST(TestLayoutTransform, Concat) { | |||
REQUIRE_GPU(1); | |||
auto cn = CompNode::load("gpu0"); | |||
cn.activate(); | |||
REQUIRE_CUDA_COMPUTE_CAPABILITY(6, 1); | |||
constexpr size_t N = 16, C = 3, H = 736, W = 1280; | |||
HostTensorGenerator<dtype::Uint8> gen; | |||
auto graph = ComputingGraph::make(); | |||
auto h2d = opr::Host2DeviceCopy::make(*graph, gen({N, C, H, W}, cn)); | |||
auto data = opr::TypeCvt::make(h2d, dtype::Float32()); | |||
auto sub_128 = data + (-128); | |||
auto x = opr::TypeCvt::make(sub_128, dtype::QuantizedS8(1.f)); | |||
auto mkcvar = [&](const char* name, const TensorShape& shp, const DType& dtype) { | |||
return opr::TypeCvt::make( | |||
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)).rename(name), | |||
dtype); | |||
}; | |||
auto w = mkcvar("w", {2, 3, 3, 3}, dtype::QuantizedS8(1.f)); | |||
auto b = mkcvar("b", {1, 2, 1, 1}, dtype::QuantizedS32(1.f)); | |||
opr::ConvBias::Param param; | |||
param.format = opr::ConvBias::Param::Format::NCHW; | |||
param.nonlineMode = opr::ConvBias::Param::NonlineMode::RELU; | |||
param.stride_h = param.stride_w = 2; | |||
param.pad_h = param.pad_w = 1; | |||
auto conv_1 = opr::ConvBias::make( | |||
x, w, b, param, {}, OperatorNodeConfig(dtype::QuantizedS8(1.f))); | |||
auto conv_1_cat = opr::Concat::make({conv_1, -conv_1}, 1); | |||
auto w2 = mkcvar("w", {4, 4, 3, 3}, dtype::QuantizedS8(1.f)); | |||
auto b2 = mkcvar("b", {1, 4, 1, 1}, dtype::QuantizedS32(1.f)); | |||
auto conv_2 = opr::ConvBias::make( | |||
conv_1_cat, w2, b2, param, {}, OperatorNodeConfig(dtype::QuantizedS8(1.f))); | |||
auto conv_2_cat = opr::Concat::make({conv_2, -conv_2}, 1); | |||
auto w3 = mkcvar("w", {16, 8, 3, 3}, dtype::QuantizedS8(1.f)); | |||
auto b3 = mkcvar("b", {1, 16, 1, 1}, dtype::QuantizedS32(1.f)); | |||
auto y = opr::ConvBias::make( | |||
conv_2_cat, w3, b3, param, {}, OperatorNodeConfig(dtype::QuantizedS8(1.f))); | |||
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | |||
S strategy = S::PROFILE; | |||
gopt::modify_opr_algo_strategy_inplace({y}, strategy); | |||
using OprFormatConfigID = LayoutTransformContext::OprFormatConfigID; | |||
using OprList = LayoutTransformContext::OprList; | |||
using Attribute = LayoutTransformContext::Attribute; | |||
using Target = LayoutTransformContext::Target; | |||
OprList opr_list = { | |||
opr::ConvBiasForward::typeinfo(), opr::ElemwiseMultiType::typeinfo(), | |||
opr::Elemwise::typeinfo(), opr::TypeCvt::typeinfo(), | |||
opr::Concat::typeinfo(), | |||
}; | |||
SmallVector<TensorFormats> available_tensor_formats = { | |||
TensorFormats::NCHW, TensorFormats::NCHWc4}; | |||
Attribute attribute = { | |||
OprFormatConfigID::NCHW, TensorFormats::NCHW, Target::UNSPEC}; | |||
auto ctx = std::make_unique<LayoutTransformContext>( | |||
std::move(opr_list), std::move(available_tensor_formats), attribute); | |||
ctx->add_opr_config( | |||
opr::ConvBiasForward::typeinfo(), | |||
{OprFormatConfigID::NCHW, OprFormatConfigID::NCHW4}); | |||
#if MGB_WITH_CACHED_TEST | |||
auto profiler = std::make_unique<ProfilerMock>( | |||
static_cast<const uint8_t*>(TestLayoutTransform_Concat.data()), | |||
TestLayoutTransform_Concat.size()); | |||
#else | |||
auto profiler = | |||
ProfilerBase::make_cached_profiler("TestLayoutTransform.Concat.cache"); | |||
#endif | |||
std::unique_ptr<SolverBase> solver{ | |||
new DynamicProgrammingSolver(std::move(profiler))}; | |||
auto new_out_vars = | |||
gopt::GraphOptimizer{} | |||
.add_pass<LayoutTransformPass>(std::move(ctx), std::move(solver)) | |||
.add_pass<ShuffleShuffleRemovePass>() | |||
.add_pass(FuseNCHW4Int8Preprocess::make()) | |||
#if CUDA_VERSION >= 10020 | |||
.add_pass<FoldingConvBiasDimshufflePass>() | |||
.add_pass<FoldingConvBiasTypecvtPass>() | |||
#endif | |||
.add_pass<ParamFusePass>() | |||
.add_pass<ParamMergePass>() | |||
.apply(SymbolVarArray{y}) | |||
.endpoint_vars(); | |||
const auto& v = new_out_vars[0]; | |||
using OutputSpecItem = cg::ComputingGraph::OutputSpecItem; | |||
std::vector<OutputSpecItem> outs; | |||
for (auto&& i : new_out_vars) { | |||
outs.emplace_back(OutputSpecItem{i, {}}); | |||
} | |||
GraphProfiler gprof{graph.get()}; | |||
auto func = graph->compile(outs); | |||
func->execute(); | |||
gprof.to_json_full(func.get())->writeto_fpath(output_file("conv_cat.json")); | |||
SmallVector<cg::OperatorNodeBase*> oprs; | |||
auto cb = [&oprs](cg::OperatorNodeBase* opr) { | |||
if (opr->same_type<opr::Concat>()) { | |||
oprs.push_back(opr); | |||
} | |||
}; | |||
cg::DepOprIter{cb}.add(v.node()->owner_opr()); | |||
ASSERT_EQ(oprs.size(), 4); | |||
ASSERT_EQ(oprs[0]->output(0)->shape().ndim, 4); | |||
ASSERT_EQ(oprs[2]->output(0)->shape().ndim, 5); | |||
} | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |