|
|
|
|
|
* implied. |
|
|
|
*/ |
|
|
|
|
|
|
|
#include "./helper.h" |
|
|
|
#include "./network.h" |
|
|
|
#include "megbrain/comp_node_env.h" |
|
|
|
#include "megbrain/gopt/global_layout_transform.h" |
|
|
|
#include "megbrain/gopt/inference.h" |
|
|
|
#include "megbrain/opr/dnn/pooling.h" |
|
|
using namespace gopt;
|
|
|
using namespace serialization; |
|
|
|
|
|
|
|
#if MGB_CUDA |
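//! The helpers in the anonymous namespace below inspect an optimized graph
//! by walking the dependency chain of an endpoint var with cg::DepOprIter.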
|
|
|
|
|
|
namespace { |
|
|
|
//! find the first operator of a specific type; raise an exception if not found
|
|
|
template <typename T> |
|
|
|
T& find_opr(SymbolVar endpoint) { |
|
|
|
T* found = nullptr; |
|
|
|
auto cb = [&found](cg::OperatorNodeBase* opr) { |
|
|
|
if (!found && opr->same_type<T>()) { |
|
|
|
found = &opr->cast_final_safe<T>(); |
|
|
|
} |
|
|
|
}; |
|
|
|
cg::DepOprIter{cb}.add(endpoint.node()->owner_opr()); |
|
|
|
    mgb_assert(found, "operator not found from endpoint %s",
               endpoint.node()->name().c_str());
|
|
|
return *found; |
|
|
|
} |
|
|
|
|
|
|
|
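//! count the operators of a specific type on the dependency chain of
//! \p endpoint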
|
|
|
template <typename T> |
|
|
|
size_t find_opr_num(SymbolVar endpoint) { |
|
|
|
size_t opr_num = 0; |
|
|
|
auto cb = [&opr_num](cg::OperatorNodeBase* opr) { |
|
|
|
if (opr->same_type<T>()) { |
|
|
|
printf("%s, %s\n", opr->cname(), opr->dyn_typeinfo()->name); |
|
|
|
opr_num++; |
|
|
|
} |
|
|
|
}; |
|
|
|
cg::DepOprIter{cb}.add(endpoint.node()->owner_opr()); |
|
|
|
return opr_num; |
|
|
|
} |
|
|
|
} // namespace |
|
|
|
|
|
|
|
TEST(TestLayoutTransform, Resnet18_QS8) { |
|
|
|
REQUIRE_GPU(1); |
|
|
|
auto cn = CompNode::load("gpu0"); |
|
|
|
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; |
|
|
|
auto sm_ver = prop.major * 10 + prop.minor; |
|
|
|
if (sm_ver < 75) { |
|
|
|
printf("This testcast ignored due to insufficient cuda cap(got: %d, " |
|
|
|
"expected: %d)\n", |
|
|
|
sm_ver, 75); |
|
|
|
return; |
|
|
|
} |
|
|
|
Network network(cn); |
|
|
|
    /// use a small batch size to reduce test time
|
|
|
auto output = make_resnet18(network, 16, dtype::QuantizedS8{1.f}); |
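    /// PROFILE strategy: choose each operator's algorithm by profiling on
    /// the target device rather than by heuristics, so the layout decisions
    /// below are based on realistic kernel costs.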
|
|
|
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; |
|
|
|
S strategy = S::PROFILE; |
|
|
|
gopt::modify_opr_algo_strategy_inplace({{output}}, strategy); |
|
|
|
|
|
|
|
|
|
|
HostTensorND t1; |
|
|
|
auto func1 = network.graph->compile({make_callback_copy(output, t1)}); |
|
|
|
func1->execute(); |
|
|
|
|
|
|
|
using OprFormat = LayoutTransformContext::OprFormat; |
|
|
|
using OprList = LayoutTransformContext::OprList; |
|
|
|
using ReformatAttribute = LayoutTransformContext::ReformatAttribute; |
|
|
|
using Attribute = LayoutTransformContext::Attribute; |
|
|
|
OprList opr_list = { |
|
|
|
opr::ConvBiasForward::typeinfo(), |
|
|
|
opr::ElemwiseMultiType::typeinfo(), |
|
|
|
opr::Elemwise::typeinfo(), |
|
|
|
opr::TypeCvt::typeinfo(), |
|
|
|
opr::PoolingForward::typeinfo(), |
|
|
|
opr::WarpPerspectiveForward::typeinfo(), |
|
|
|
}; |
|
|
|
SmallVector<TensorFormats> available_tensor_formats = { |
|
|
|
TensorFormats::NCHW, TensorFormats::NHWC, TensorFormats::NCHWc4, |
|
|
|
TensorFormats::NCHWc32, TensorFormats::CHWNc4}; |
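    /// Attribute bundles the fallback operator format, the fallback tensor
    /// format and the reformat behavior; AUTO_PADDING_NHWC allows padding
    /// the channel dimension when reformatting to NHWC.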
|
|
|
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW, |
|
|
|
ReformatAttribute::AUTO_PADDING_NHWC}; |
|
|
|
auto ctx = std::make_unique<LayoutTransformContext>( |
|
|
|
std::move(opr_list), std::move(available_tensor_formats), |
|
|
|
attribute); |
|
|
|
ctx->add_opr_config(opr::ConvBiasForward::typeinfo(), |
|
|
|
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4, |
|
|
|
OprFormat::NHWC}) |
|
|
|
.add_opr_config(opr::PoolingForward::typeinfo(), |
|
|
|
{OprFormat::NCHW4, OprFormat::NCHW32, |
|
|
|
OprFormat::NHWC, OprFormat::CHWN4}); |
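    /// The profiler measures every operator in each candidate format; the
    /// dynamic programming solver then picks the per-operator formats that
    /// minimize the total profiled cost, reformat overhead included.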
|
|
|
auto profiler = ProfilerBase::make_profiler(); |
|
|
|
std::unique_ptr<SolverBase> solver{ |
|
|
|
new DynamicProgrammingSolver(std::move(profiler))}; |
|
|
|
auto new_output = gopt::GraphOptimizer{} |
|
|
|
.add_pass<FuseConvBiasNonlinPass>() |
|
|
|
.add_pass<FuseConvBiasZPass>() |
|
|
|
.add_pass<LayoutTransformPass>(std::move(ctx), |
|
|
|
std::move(solver)) |
|
|
|
.add_pass<ShuffleShuffleRemovePass>() |
|
|
|
.add_pass(FuseNCHW4Int8Preprocess::make()) |
|
|
|
.add_pass<FoldingConvBiasDimshufflePass>() |
|
|
|
.add_pass<ParamFusePass>() |
|
|
|
.add_pass<ParamMergePass>() |
|
|
|
.apply({{output}}) |
|
|
|
.endpoint_vars(); |
|
|
|
auto new_out_var = new_output[0]; |
|
|
|
/// check global layout transform pass |
|
|
|
auto nr_dimshuffle = find_opr_num<opr::Dimshuffle>(new_out_var); |
|
|
|
ASSERT_EQ(nr_dimshuffle, 3u); |
|
|
|
    /// check the fuse-conv-bias-with-z pass
|
|
|
auto nr_elemwise_mult_type = |
|
|
|
find_opr_num<opr::ElemwiseMultiType>(new_out_var); |
|
|
|
ASSERT_EQ(nr_elemwise_mult_type, 4u); |
|
|
|
    /// 21 convolutions, each with a weight and a bias: 42 parameters in total
|
|
|
const auto& param_merge = |
|
|
|
find_opr<opr::MultipleDeviceTensorHolder>(new_out_var); |
|
|
|
ASSERT_EQ(param_merge.output().size(), 42u); |
|
|
|
/// check first conv format |
|
|
|
const auto& first_conv = find_opr<opr::ConvBiasForward>(new_out_var); |
|
|
|
const auto& cast = first_conv.cast_final_safe<opr::ConvBiasForward>(); |
|
|
|
ASSERT_EQ(cast.param().format, opr::ConvBias::Param::Format::NCHW4); |
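    /// run the optimized graph, dump per-operator profiling data to JSON,
    /// and verify the result against the unoptimized graph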
|
|
|
|
|
|
|
GraphProfiler gprof{network.graph.get()}; |
|
|
|
HostTensorND t2; |
|
|
|
auto func2 = network.graph->compile({make_callback_copy(new_out_var, t2)}); |
|
|
|
func2->execute(); |
|
|
|
gprof.to_json_full(func2.get()) |
|
|
|
->writeto_fpath(output_file("resnet18_qs8.json")); |
|
|
|
    /// check correctness of the optimized graph
|
|
|
MGB_ASSERT_TENSOR_EQ(t1, t2); |
|
|
|
} |
|
|
|
|
|
|
|
TEST(TestLayoutTransform, Resnet18_QS4) { |
|
|
|
REQUIRE_GPU(1); |
|
|
|
auto cn = CompNode::load("gpu0"); |
|
|
|
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; |
|
|
|
auto sm_ver = prop.major * 10 + prop.minor; |
|
|
|
if (sm_ver < 75) { |
|
|
|
printf("This testcast ignored due to insufficient cuda cap(got: %d, " |
|
|
|
"expected: %d)\n", |
|
|
|
sm_ver, 75); |
|
|
|
return; |
|
|
|
} |
|
|
|
Network network(cn); |
|
|
|
auto output = make_resnet18(network, 16, dtype::QuantizedS4{1.f}); |
|
|
|
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; |
|
|
|
S strategy = S::PROFILE; |
|
|
|
|
|
|
gopt::modify_opr_algo_strategy_inplace({{output}}, strategy); |
|
|
|
|
|
|
|
HostTensorND t1; |
|
|
|
auto func1 = network.graph->compile({make_callback_copy(output, t1)}); |
|
|
|
func1->execute(); |
|
|
|
|
|
|
|
using OprFormat = LayoutTransformContext::OprFormat; |
|
|
|
using OprList = LayoutTransformContext::OprList; |
|
|
    using ReformatAttribute = LayoutTransformContext::ReformatAttribute;
    using Attribute = LayoutTransformContext::Attribute;
    OprList opr_list = {
            opr::ConvBiasForward::typeinfo(),
            opr::ElemwiseMultiType::typeinfo(),
            opr::Elemwise::typeinfo(),
            opr::TypeCvt::typeinfo(),
            opr::PoolingForward::typeinfo(),
|
|
|
opr::WarpPerspectiveForward::typeinfo(), |
|
|
|
}; |
|
|
|
SmallVector<TensorFormats> available_tensor_formats = { |
|
|
|
|
|
|
TensorFormats::NCHW, TensorFormats::NHWC, |
|
|
|
TensorFormats::NCHWc4, TensorFormats::NCHWc32, |
|
|
|
TensorFormats::NCHWc64, TensorFormats::CHWNc4}; |
|
|
|
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW, |
|
|
|
ReformatAttribute::AUTO_PADDING_NHWC}; |
|
|
|
auto ctx = std::make_unique<LayoutTransformContext>( |
|
|
|
std::move(opr_list), std::move(available_tensor_formats), |
|
|
|
attribute); |
|
|
|
ctx->add_opr_config(opr::ConvBiasForward::typeinfo(), |
|
|
|
|
|
|
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4, |
|
|
|
OprFormat::NHWC, OprFormat::NCHW64}) |
|
|
|
.add_opr_config( |
|
|
|
opr::PoolingForward::typeinfo(), |
|
|
|
|
|
|
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64, |
|
|
|
OprFormat::NHWC, OprFormat::CHWN4}); |
|
|
|
auto profiler = ProfilerBase::make_profiler(); |
|
|
|
|
|
|
std::unique_ptr<SolverBase> solver{ |
|
|
|
new DynamicProgrammingSolver(std::move(profiler))}; |
|
|
|
auto new_output = gopt::GraphOptimizer{} |
|
|
|
.add_pass<FuseConvBiasNonlinPass>() |
|
|
|
.add_pass<FuseConvBiasZPass>() |
|
|
|
.add_pass<LayoutTransformPass>(std::move(ctx), |
|
|
|
std::move(solver)) |
|
|
|
.add_pass<ShuffleShuffleRemovePass>() |
|
|
|
.add_pass(FuseNCHW4Int8Preprocess::make()) |
|
|
|
.add_pass<FoldingConvBiasDimshufflePass>() |
|
|
|
.add_pass<ParamFusePass>() |
|
|
|
.add_pass<ParamMergePass>() |
|
|
|
.apply({{output}}) |
|
|
|
.endpoint_vars(); |
|
|
|
auto new_out_var = new_output[0]; |
|
|
|
/// check global layout transform pass |
|
|
|
auto nr_dimshuffle = find_opr_num<opr::Dimshuffle>(new_out_var); |
|
|
|
ASSERT_EQ(nr_dimshuffle, 3u); |
|
|
|
    /// check the fuse-conv-bias-with-z pass
|
|
|
auto nr_elemwise_mult_type = |
|
|
|
find_opr_num<opr::ElemwiseMultiType>(new_out_var); |
|
|
|
ASSERT_EQ(nr_elemwise_mult_type, 4u); |
|
|
|
    /// 21 convolutions, each with a weight and a bias: 42 parameters in total
|
|
|
const auto& param_merge = |
|
|
|
find_opr<opr::MultipleDeviceTensorHolder>(new_out_var); |
|
|
|
ASSERT_EQ(param_merge.output().size(), 42u); |
|
|
|
/// check first conv format |
|
|
|
const auto& first_conv = find_opr<opr::ConvBiasForward>(new_out_var); |
|
|
|
const auto& cast = first_conv.cast_final_safe<opr::ConvBiasForward>(); |
|
|
|
ASSERT_EQ(cast.param().format, opr::ConvBias::Param::Format::NHWC); |
|
|
|
|
|
|
|
GraphProfiler gprof{network.graph.get()}; |
|
|
|
HostTensorND t2; |
|
|
|
auto func2 = network.graph->compile({make_callback_copy(new_out_var, t2)}); |
|
|
|
func2->execute(); |
|
|
|
gprof.to_json_full(func2.get()) |
|
|
|
->writeto_fpath(output_file("resnet18_qs4.json")); |
|
|
|
MGB_ASSERT_TENSOR_EQ(t1, t2); |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
TEST(TestLayoutTransform, Resnet18_NCHW64) { |
|
|
|
REQUIRE_GPU(1); |
|
|
|
auto cn = CompNode::load("gpu0"); |
|
|
|
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; |
|
|
|
auto sm_ver = prop.major * 10 + prop.minor; |
|
|
|
if (sm_ver < 75) { |
|
|
|
printf("This testcast ignored due to insufficient cuda cap(got: %d, " |
|
|
|
"expected: %d)\n", |
|
|
|
sm_ver, 75); |
|
|
|
return; |
|
|
|
} |
|
|
|
Network network(cn); |
|
|
|
auto output = make_resnet18(network, 64, dtype::QuantizedS4{1.f}); |
|
|
|
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; |
|
|
|
S strategy = S::PROFILE; |
|
|
|
gopt::modify_opr_algo_strategy_inplace({{output}}, strategy); |
|
|
|
|
|
|
|
|
|
|
HostTensorND t1; |
|
|
|
auto func1 = network.graph->compile({make_callback_copy(output, t1)}); |
|
|
|
func1->execute(); |
|
|
|
|
|
|
|
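    /// this test uses the preset optimize_for_inference entry with the
    /// NCHW64 option instead of driving LayoutTransformPass directly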
|
|
|
SymbolVar new_out_var; |
|
|
|
auto options = gopt::OptimizeForInferenceOptions{}; |
|
|
|
options.enable_nchw64(); |
|
|
|
unpack_vector(gopt::optimize_for_inference({output}, options), new_out_var); |
|
|
|
|
|
|
|
GraphProfiler gprof{network.graph.get()}; |
|
|
|
HostTensorND t2; |
|
|
|
auto func2 = network.graph->compile({make_callback_copy(new_out_var, t2)}); |
|
|
|
func2->execute(); |
|
|
|
gprof.to_json_full(func2.get()) |
|
|
|
->writeto_fpath(output_file("resnet18_nchw64.json")); |
|
|
|
MGB_ASSERT_TENSOR_EQ(t1, t2); |
|
|
|
} |
|
|
|
|
|
|
|
TEST(TestLayoutTransform, Detection_QS8) { |
|
|
|
REQUIRE_GPU(1); |
|
|
|
auto cn = CompNode::load("gpu0"); |
|
|
|
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; |
|
|
|
auto sm_ver = prop.major * 10 + prop.minor; |
|
|
|
if (sm_ver < 75) { |
|
|
|
printf("This testcast ignored due to insufficient cuda cap(got: %d, " |
|
|
|
"expected: %d)\n", |
|
|
|
sm_ver, 75); |
|
|
|
return; |
|
|
|
} |
|
|
|
Network network(cn); |
|
|
|
auto outputs = make_det(network, 16, dtype::QuantizedS8{1.f}); |
|
|
|
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; |
|
|
|
S strategy = S::PROFILE; |
|
|
|
|
|
|
gopt::modify_opr_algo_strategy_inplace({outputs}, strategy); |
|
|
|
|
|
|
|
using OprFormat = LayoutTransformContext::OprFormat; |
|
|
|
using OprList = LayoutTransformContext::OprList; |
|
|
    using ReformatAttribute = LayoutTransformContext::ReformatAttribute;
|
|
|
using Attribute = LayoutTransformContext::Attribute; |
|
|
|
OprList opr_list = { |
|
|
|
opr::ConvBiasForward::typeinfo(), |
|
|
|
|
|
|
opr::ElemwiseMultiType::typeinfo(), |
|
|
|
opr::Elemwise::typeinfo(), |
|
|
|
opr::TypeCvt::typeinfo(), |
|
|
            opr::PoolingForward::typeinfo(),
            opr::WarpPerspectiveForward::typeinfo(),
    };
    SmallVector<TensorFormats> available_tensor_formats = {
            TensorFormats::NCHW, TensorFormats::NHWC,
|
|
|
TensorFormats::NCHWc4, TensorFormats::NCHWc32, |
|
|
|
TensorFormats::NCHWc64, TensorFormats::CHWNc4}; |
|
|
|
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW, |
|
|
|
|
|
|
ReformatAttribute::AUTO_PADDING_NHWC}; |
|
|
|
auto ctx = std::make_unique<LayoutTransformContext>( |
|
|
|
std::move(opr_list), std::move(available_tensor_formats), |
|
|
|
attribute); |
|
|
|
|
|
|
ctx->add_opr_config(opr::ConvBiasForward::typeinfo(), |
|
|
|
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4, |
|
|
|
OprFormat::NHWC, OprFormat::NCHW64}) |
|
|
|
.add_opr_config( |
|
|
|
opr::PoolingForward::typeinfo(), |
|
|
|
|
|
|
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64, |
|
|
|
OprFormat::NHWC, OprFormat::CHWN4}); |
|
|
|
auto profiler = ProfilerBase::make_profiler(); |
|
|
|
std::unique_ptr<SolverBase> solver{ |
|
|
|
new DynamicProgrammingSolver(std::move(profiler))}; |
|
|
|
auto new_outputs = gopt::GraphOptimizer{} |
|
|
|
.add_pass<FuseConvBiasNonlinPass>() |
|
|
|
.add_pass<FuseConvBiasZPass>() |
|
|
|
.add_pass<LayoutTransformPass>(std::move(ctx), |
|
|
|
std::move(solver)) |
|
|
|
.add_pass<ShuffleShuffleRemovePass>() |
|
|
|
.add_pass(FuseNCHW4Int8Preprocess::make()) |
|
|
|
.add_pass<FoldingConvBiasDimshufflePass>() |
|
|
|
.add_pass<ParamFusePass>() |
|
|
|
.add_pass<ParamMergePass>() |
|
|
|
.apply({{outputs}}) |
|
|
|
.endpoint_vars(); |
|
|
|
|
|
|
|
GraphProfiler gprof{network.graph.get()}; |
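    /// the detection tests have no numeric reference: they only execute the
    /// optimized graph and dump profiling data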
|
|
|
using OutputSpecItem = cg::ComputingGraph::OutputSpecItem; |
|
|
|
std::vector<OutputSpecItem> output_spec; |
|
|
|
for (const auto& i : new_outputs) { |
|
|
|
output_spec.emplace_back(OutputSpecItem{i, {}}); |
|
|
|
} |
|
|
|
auto func = network.graph->compile(output_spec); |
|
|
|
func->execute(); |
|
|
|
gprof.to_json_full(func.get())->writeto_fpath(output_file("det_qs8.json")); |
|
|
|
} |
|
|
|
|
|
|
|
TEST(TestLayoutTransform, Detection_QS4) { |
|
|
|
REQUIRE_GPU(1); |
|
|
|
auto cn = CompNode::load("gpu0"); |
|
|
|
auto&& prop = CompNodeEnv::from_comp_node(cn).cuda_env().device_prop; |
|
|
|
auto sm_ver = prop.major * 10 + prop.minor; |
|
|
|
if (sm_ver < 75) { |
|
|
|
printf("This testcast ignored due to insufficient cuda cap(got: %d, " |
|
|
|
"expected: %d)\n", |
|
|
|
sm_ver, 75); |
|
|
|
return; |
|
|
|
} |
|
|
|
Network network(cn); |
|
|
|
auto outputs = make_det(network, 16, dtype::QuantizedS4{1.f}); |
|
|
|
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; |
|
|
|
S strategy = S::PROFILE; |
|
|
|
gopt::modify_opr_algo_strategy_inplace({outputs}, strategy); |
|
|
|
|
|
|
|
using OprFormat = LayoutTransformContext::OprFormat; |
|
|
|
using OprList = LayoutTransformContext::OprList; |
|
|
|
using ReformatAttribute = LayoutTransformContext::ReformatAttribute; |
|
|
|
using Attribute = LayoutTransformContext::Attribute; |
|
|
|
OprList opr_list = { |
|
|
|
opr::ConvBiasForward::typeinfo(), |
|
|
|
opr::ElemwiseMultiType::typeinfo(), |
|
|
|
opr::Elemwise::typeinfo(), |
|
|
|
opr::TypeCvt::typeinfo(), |
|
|
|
opr::PoolingForward::typeinfo(), |
|
|
|
opr::WarpPerspectiveForward::typeinfo(), |
|
|
|
}; |
|
|
|
SmallVector<TensorFormats> available_tensor_formats = { |
|
|
|
TensorFormats::NCHW, TensorFormats::NHWC, |
|
|
|
TensorFormats::NCHWc4, TensorFormats::NCHWc32, |
|
|
|
TensorFormats::NCHWc64, TensorFormats::CHWNc4}; |
|
|
|
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW, |
|
|
|
ReformatAttribute::AUTO_PADDING_NHWC}; |
|
|
|
auto ctx = std::make_unique<LayoutTransformContext>( |
|
|
|
std::move(opr_list), std::move(available_tensor_formats), |
|
|
|
attribute); |
|
|
|
ctx->add_opr_config(opr::ConvBiasForward::typeinfo(), |
|
|
|
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::CHWN4, |
|
|
|
OprFormat::NHWC, OprFormat::NCHW64}) |
|
|
|
.add_opr_config( |
|
|
|
opr::PoolingForward::typeinfo(), |
|
|
|
{OprFormat::NCHW4, OprFormat::NCHW32, OprFormat::NCHW64, |
|
|
|
OprFormat::NHWC, OprFormat::CHWN4}); |
|
|
|
auto profiler = ProfilerBase::make_profiler(); |
|
|
|
std::unique_ptr<SolverBase> solver{ |
|
|
|
new DynamicProgrammingSolver(std::move(profiler))}; |
|
|
|
|
|
|
auto new_outputs = gopt::GraphOptimizer{} |
|
|
|
.add_pass<FuseConvBiasNonlinPass>() |
|
|
|
.add_pass<FuseConvBiasZPass>() |
|
|
|
.add_pass<LayoutTransformPass>(std::move(ctx), |
|
|
|
std::move(solver)) |
|
|
|
.add_pass<ShuffleShuffleRemovePass>() |
|
|
|
.add_pass(FuseNCHW4Int8Preprocess::make()) |
|
|
|
.add_pass<FoldingConvBiasDimshufflePass>() |
|
|
|
.add_pass<ParamFusePass>() |
|
|
|
.add_pass<ParamMergePass>() |
|
|
|
.apply({{outputs}}) |
|
|
|
.endpoint_vars(); |
|
|
|
|
|
|
|
GraphProfiler gprof{network.graph.get()}; |
|
|
|
using OutputSpecItem = cg::ComputingGraph::OutputSpecItem; |
|
|
|
|
|
|
std::vector<OutputSpecItem> output_spec; |
|
|
|
for (const auto& i : new_outputs) { |
|
|
|
output_spec.emplace_back(OutputSpecItem{i, {}}); |
|
|
|
} |
|
|
|
|
|
|
auto func = network.graph->compile(output_spec); |
|
|
|
func->execute(); |
|
|
|
gprof.to_json_full(func.get())->writeto_fpath(output_file("det_qs4.json")); |
|
|
|
} |
|
|
|
|
|
|
|
/*! |
|
|
|
 * Test the performance of the solver when the network is wide.
|
|
|
*/ |
|
|
|
TEST(TestLayoutTransform, Wide) { |
|
|
|
REQUIRE_GPU(1); |
|
|
|
auto cn = CompNode::load("gpu0"); |
|
|
|
Network network(cn); |
|
|
|
auto data = network.add_var("data", {16, 3, 64, 64}); |
|
|
|
auto f = network.add_conv(data, 16, {3, 3}, dtype::Float32(), true, {2, 2}, |
|
|
|
{1, 1}); |
|
|
|
f = network.add_conv(f, 16, {3, 3}, dtype::Float32(), true, {2, 2}, {1, 1}); |
|
|
|
f = network.add_conv(f, 16, {3, 3}, dtype::Float32(), true, {2, 2}, {1, 1}); |
|
|
|
SymbolVarArray stages; |
|
|
|
for (size_t i = 0; i < 8; ++i) { |
|
|
|
f = f * f + f; |
|
|
|
stages.push_back(f); |
|
|
|
} |
|
|
|
auto y = stages[0]; |
|
|
|
for (size_t i = 1; i < stages.size(); ++i) { |
|
|
|
y = y + stages[i]; |
|
|
|
} |
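    /// summing all stages creates a wide join with many parallel branches,
    /// which stresses the state space of the dynamic programming solver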
|
|
|
|
|
|
|
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; |
|
|
|
S strategy = S::PROFILE; |
|
|
|
gopt::modify_opr_algo_strategy_inplace({y}, strategy); |
|
|
|
|
|
|
|
using OprFormat = LayoutTransformContext::OprFormat; |
|
|
|
using OprList = LayoutTransformContext::OprList; |
|
|
|
using ReformatAttribute = LayoutTransformContext::ReformatAttribute; |
|
|
|
using Attribute = LayoutTransformContext::Attribute; |
|
|
|
OprList opr_list = { |
|
|
|
opr::ConvBiasForward::typeinfo(), |
|
|
|
opr::Elemwise::typeinfo(), |
|
|
|
}; |
|
|
|
SmallVector<TensorFormats> available_tensor_formats = {TensorFormats::NCHW, |
|
|
|
TensorFormats::NHWC}; |
|
|
|
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW, |
|
|
|
ReformatAttribute::DEFAULT}; |
|
|
|
auto ctx = std::make_unique<LayoutTransformContext>( |
|
|
|
std::move(opr_list), std::move(available_tensor_formats), |
|
|
|
attribute); |
|
|
|
ctx->add_opr_config(opr::ConvBiasForward::typeinfo(), |
|
|
|
{OprFormat::NCHW, OprFormat::NHWC}); |
|
|
|
auto profiler = ProfilerBase::make_profiler(); |
|
|
|
std::unique_ptr<SolverBase> solver{ |
|
|
|
new DynamicProgrammingSolver(std::move(profiler))}; |
|
|
|
auto v = gopt::GraphOptimizer{} |
|
|
|
.add_pass<FuseConvBiasNonlinPass>() |
|
|
|
.add_pass<FuseConvBiasZPass>() |
|
|
|
.add_pass<LayoutTransformPass>(std::move(ctx), |
|
|
|
std::move(solver)) |
|
|
|
.add_pass<ShuffleShuffleRemovePass>() |
|
|
|
.add_pass<ParamFusePass>() |
|
|
|
.add_pass<ParamMergePass>() |
|
|
|
.apply({{y}}) |
|
|
|
.endpoint_vars(); |
|
|
|
const auto& sym_o = v[0]; |
|
|
|
GraphProfiler gprof{network.graph.get()}; |
|
|
|
auto func = network.graph->compile({{sym_o, {}}}); |
|
|
|
func->execute(); |
|
|
|
gprof.to_json_full(func.get())->writeto_fpath(output_file("wide.json")); |
|
|
|
    /// check global layout transform pass: no dimshuffle expected
|
|
|
auto nr_dimshuffle = find_opr_num<opr::Dimshuffle>(sym_o); |
|
|
|
ASSERT_EQ(nr_dimshuffle, 0u); |
|
|
|
auto nr_param_merge = find_opr_num<opr::MultipleDeviceTensorHolder>(sym_o); |
|
|
|
ASSERT_EQ(nr_param_merge, 1u); |
|
|
|
/// check first conv format |
|
|
|
const auto& first_conv = find_opr<opr::ConvBiasForward>(sym_o); |
|
|
|
const auto& cast = first_conv.cast_final_safe<opr::ConvBiasForward>(); |
|
|
|
ASSERT_EQ(cast.param().format, opr::ConvBias::Param::Format::NCHW); |
|
|
|
} |
|
|
|
|
|
|
|
TEST(TestLayoutTransform, ElemwiseMultiType) { |
|
|
|
REQUIRE_GPU(1); |
|
|
|
auto cn = CompNode::load("gpu0"); |
|
|
|
Network network(cn); |
|
|
|
auto x = network.add_var("x", {64, 64, 1, 2}); |
|
|
|
auto y = network.add_var("y", {64, 64, 1, 2}); |
|
|
|
x = network.add_type_cvt(x, dtype::QuantizedS4{1.f}); |
|
|
|
y = network.add_type_cvt(y, dtype::QuantizedS4{1.f}); |
|
|
|
auto x_ = network.add_type_cvt(x, dtype::Float32()); |
|
|
|
auto y_ = network.add_type_cvt(y, dtype::Float32()); |
|
|
|
auto z = network.add_elemwise({x_, y_}, dtype::Float32(), |
|
|
|
opr::Elemwise::Mode::FUSE_ADD_RELU); |
|
|
|
z = network.add_type_cvt(z, dtype::QuantizedS4{1.f}); |
|
|
|
z = network.add_type_cvt(z, dtype::Float32()); |
|
|
|
auto z2 = network.add_elemwise({x, y}, dtype::QuantizedS4{1.f}, |
|
|
|
opr::Elemwise::Mode::FUSE_ADD_RELU); |
|
|
|
z2 = network.add_type_cvt(z2, dtype::Float32()); |
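    /// t1 is the float32 reference, t3 the quantized fuse-add-relu; t2
    /// below recomputes t3 through explicit NCHW <-> NCHW64 relayouts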
|
|
|
HostTensorND t1; |
|
|
|
auto func1 = network.graph->compile({make_callback_copy(z, t1)}); |
|
|
|
func1->execute(); |
|
|
|
|
|
|
|
HostTensorND t3; |
|
|
|
auto func3 = network.graph->compile({make_callback_copy(z2, t3)}); |
|
|
|
func3->execute(); |
|
|
|
|
|
|
|
auto alter_x = opr::RelayoutFormat::make( |
|
|
|
x, megdnn::param::RelayoutFormat::Mode::NCHW_NCHW64); |
|
|
|
auto alter_y = opr::RelayoutFormat::make( |
|
|
|
y, megdnn::param::RelayoutFormat::Mode::NCHW_NCHW64); |
|
|
|
auto alter_z = |
|
|
|
network.add_elemwise({alter_x, alter_y}, dtype::QuantizedS4{1.f}, |
|
|
|
opr::Elemwise::Mode::FUSE_ADD_RELU); |
|
|
|
alter_z = opr::RelayoutFormat::make( |
|
|
|
alter_z, megdnn::param::RelayoutFormat::Mode::NCHW64_NCHW); |
|
|
|
alter_z = network.add_type_cvt(alter_z, dtype::Float32()); |
|
|
|
HostTensorND t2; |
|
|
|
auto func2 = network.graph->compile({make_callback_copy(alter_z, t2)}); |
|
|
|
func2->execute(); |
|
|
|
// MGB_ASSERT_TENSOR_EQ(t1, t3); |
|
|
|
MGB_ASSERT_TENSOR_EQ(t2, t3); |
|
|
|
} |
|
|
|
|
|
|
|
TEST(TestLayoutTransform, DetectionHead) { |
|
|
    REQUIRE_GPU(1);
    auto cn = CompNode::load("gpu0");
|
|
|
cn.activate(); |
|
|
|
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); |
|
|
|
|
|
|
|
|
|
|
constexpr size_t N = 16, C = 3, H = 736, W = 1280; |
|
|
|
HostTensorGenerator<dtype::Uint8> gen; |
|
|
|
|
|
|
|
auto graph = ComputingGraph::make(); |
|
|
|
|
|
.add_pass<ParamMergePass>() |
|
|
|
.apply(SymbolVarArray{y}) |
|
|
|
.endpoint_vars(); |
|
|
|
const auto& v = new_out_vars[0]; |
|
|
|
using OutputSpecItem = cg::ComputingGraph::OutputSpecItem; |
|
|
|
|
|
|
std::vector<OutputSpecItem> outs; |
|
|
|
for (const auto& i : new_out_vars) { |
|
|
|
outs.emplace_back(OutputSpecItem{i, {}}); |
|
|
|
} |
|
|
|
GraphProfiler gprof{graph.get()}; |
|
|
|
auto func = graph->compile(outs); |
|
|
|
|
|
|
func->execute(); |
|
|
|
gprof.to_json_full(func.get())->writeto_fpath(output_file("det_head.json")); |
|
|
|
/// check reformat |
|
|
|
auto nr_reformat = find_opr_num<opr::RelayoutFormat>(v); |
|
|
|
ASSERT_EQ(nr_reformat, 2u); |
|
|
|
/// check dimshuffle |
|
|
|
auto nr_dimshuffle = find_opr_num<opr::Dimshuffle>(v); |
|
|
|
ASSERT_EQ(nr_dimshuffle, 0u); |
|
|
|
/// check conv_bias |
|
|
|
auto nr_conv = find_opr_num<opr::ConvBiasForward>(v); |
|
|
|
ASSERT_EQ(nr_conv, 2u); |
|
|
|
/// check first conv format |
|
|
|
const auto& first_conv = find_opr<opr::ConvBiasForward>(v); |
|
|
|
const auto& cast = first_conv.cast_final_safe<opr::ConvBiasForward>(); |
|
|
|
ASSERT_EQ(cast.param().format, opr::ConvBias::Param::Format::NCHW4_NHWC); |
|
|
|
} |
|
|
|
|
|
|
|
#endif |
|
|
|
|
|
|
|
TEST(TestLayoutTransform, CanonicalizeLayoutTransform) { |
|
|
|
constexpr size_t N = 64, C = 64, H = 1, W = 1; |
|
|
|
auto cn = CompNode::load("xpu0"); |
|
|
|
Network network(cn); |
|
|
|
auto x = network.add_var("x", {N, C / 4, H, W, 4}); |
|
|
|
x = network.add_type_cvt(x, dtype::QuantizedS4{1.f}); |
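    /// hand-build an NCHW4 -> NHWC reformat with ReformatEmitter; the
    /// ShuffleShuffleRemovePass applied below should canonicalize the
    /// resulting dimshuffle/reshape chain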
|
|
|
using NamedTensorShape = megdnn::NamedTensorShape; |
|
|
|
auto src = NamedTensorShape::make_named_tensor_shape( |
|
|
|
NamedTensorShape::Format::NCHW4); |
|
|
|
auto dst = NamedTensorShape::make_named_tensor_shape( |
|
|
|
NamedTensorShape::Format::NHWC); |
|
|
|
auto [builder, _] = gopt::ReformatEmitter(src, dst).emit(); |
|
|
|
MGB_MARK_USED_VAR(_); |
|
|
|
x = SymbolVar(builder({x.node()})); |
|
|
|
x = opr::Reshape::make(x, {N, H, W, C}); |
|
|
|
x = network.add_type_cvt(x, dtype::Float32()); |
|
|
|
|
|
|
|
SymbolVar another_x; |
|
|
|
unpack_vector(gopt::GraphOptimizer{} |
|
|
|
.add_pass<gopt::ShuffleShuffleRemovePass>() |
|
|
|
.apply({{x}}) |
|
|
|
.endpoint_vars(), |
|
|
|
another_x); |
|
|
|
const auto& astype = find_opr<opr::TypeCvt>(x); |
|
|
|
EXPECT_TRUE(astype.input(0)->owner_opr()->dyn_typeinfo() == |
|
|
|
opr::Host2DeviceCopy::typeinfo()); |
|
|
|
const auto& another_astype = find_opr<opr::TypeCvt>(another_x); |
|
|
|
EXPECT_TRUE(another_astype.input(0)->owner_opr()->dyn_typeinfo() == |
|
|
|
opr::Reshape::typeinfo()); |
|
|
|
|
|
|
|
HostTensorND t1; |
|
|
|
auto func1 = network.graph->compile({make_callback_copy(x, t1)}); |
|
|
|
func1->execute(); |
|
|
|
|
|
|
|
HostTensorND t2; |
|
|
|
auto func2 = network.graph->compile({make_callback_copy(another_x, t2)}); |
|
|
|
func2->execute(); |
|
|
|
MGB_ASSERT_TENSOR_EQ(t1, t2); |
|
|
|
} |
|
|
|
|
|
|
|
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |