GitOrigin-RevId: 8ef62baf79
release-1.6
@@ -166,6 +166,13 @@ void aarch64::RelayoutForwardImpl::exec(_megdnn_tensor_in src0,
    TensorND src = src0, dst = dst0;
    check_layout_and_canonize(src.layout, dst.layout);
    // FIXME: optimize for lowbit cases
    if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4 ||
        src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
        fallback::RelayoutForwardImpl::exec(src0, dst0, src_handle);
        return;
    }
    relayout::TransposeParam trans_param;
    bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param);
    if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) {
@@ -134,6 +134,13 @@ void armv7::RelayoutForwardImpl::exec(_megdnn_tensor_in src0,
    TensorND src = src0, dst = dst0;
    check_layout_and_canonize(src.layout, dst.layout);
    // FIXME: optimize for lowbit cases
    if (src.layout.dtype.enumv() == DTypeEnum::QuantizedS4 ||
        src.layout.dtype.enumv() == DTypeEnum::Quantized4Asymm) {
        fallback::RelayoutForwardImpl::exec(src0, dst0, src_handle);
        return;
    }
    relayout::TransposeParam trans_param;
    bool trans = relayout::is_transpose(src.layout, dst.layout, trans_param);
    if (trans && trans_param.c == 1 && src0.layout.dtype.size() == 1) {
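// Both hunks above add the same guard: QuantizedS4 and Quantized4Asymm pack
// two 4-bit elements into one byte, so the ARM transpose fast path (which
// assumes one byte per element) must not see them; the generic fallback
// implementation handles the relayout instead. A minimal sketch of the
// shared predicate (hypothetical helper, not part of this patch):
#if 0
static inline bool is_4bit_quantized(const megdnn::DType& dtype) {
    using megdnn::DTypeEnum;
    // two elements share one byte, so byte-wise transpose kernels break
    return dtype.enumv() == DTypeEnum::QuantizedS4 ||
           dtype.enumv() == DTypeEnum::Quantized4Asymm;
}
#endif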
@@ -0,0 +1,313 @@
/**
 * \file src/gopt/impl/opr_format_modifier.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "./opr_format_modifier.h" | |||
#include "megbrain/opr/dnn/convolution.h" | |||
#include "megbrain/opr/dnn/pooling.h" | |||
#include "megbrain/opr/imgproc.h" | |||
#include "megbrain/opr/io.h" | |||
#include "megbrain/serialization/sereg.h" | |||
#include "midout.h" | |||
MIDOUT_DECL(megbrain_opr_format_modifier) | |||
#define MIDOUT_B(...) MIDOUT_BEGIN(megbrain_opr_format_modifier, __VA_ARGS__) { | |||
#define MIDOUT_E \ | |||
} \ | |||
MIDOUT_END(); | |||
using namespace mgb; | |||
using namespace opr; | |||
namespace { | |||
template <class MegDNNConv = megdnn::Convolution> | |||
struct MakeConvCaller2 { | |||
template <typename Opr> | |||
static VarNode* make(const cg::VarNodeArray& inputs, | |||
const typename MegDNNConv::Param& param, | |||
const megdnn::param::ExecutionPolicy& execution_policy, | |||
const OperatorNodeConfig& config) { | |||
if (inputs.size() == 2) { | |||
return Opr::make(inputs[0], inputs[1], param, execution_policy, | |||
config) | |||
.node(); | |||
} | |||
return nullptr; | |||
} | |||
}; | |||
template <class MegDNNConv = megdnn::Convolution> | |||
struct MakeConvCaller3 { | |||
template <typename Opr> | |||
static VarNode* make(const cg::VarNodeArray& inputs, | |||
const typename MegDNNConv::Param& param, | |||
const megdnn::param::ExecutionPolicy& execution_policy, | |||
const OperatorNodeConfig& config) { | |||
if (inputs.size() == 3) { | |||
return Opr::make(inputs[0], inputs[1], inputs[2], param, | |||
execution_policy, config) | |||
.node(); | |||
} | |||
return nullptr; | |||
} | |||
}; | |||
template <class MegDNNConv = megdnn::Convolution> | |||
struct MakeConvCaller4 { | |||
template <typename Opr> | |||
static VarNode* make(const cg::VarNodeArray& inputs, | |||
const typename MegDNNConv::Param& param, | |||
const megdnn::param::ExecutionPolicy& execution_policy, | |||
const OperatorNodeConfig& config) { | |||
if (inputs.size() == 4) { | |||
return Opr::make(inputs[0], inputs[1], inputs[2], inputs[3], param, | |||
execution_policy, config) | |||
.node(); | |||
} | |||
return nullptr; | |||
} | |||
}; | |||
template <class MegDNNConv = megdnn::Convolution> | |||
struct MakeConvCaller5 { | |||
template <typename Opr> | |||
static VarNode* make(const cg::VarNodeArray& inputs, | |||
const typename MegDNNConv::Param& param, | |||
const megdnn::param::ExecutionPolicy& execution_policy, | |||
const OperatorNodeConfig& config) { | |||
if (inputs.size() == 5) { | |||
return Opr::make(inputs[0], inputs[1], inputs[2], inputs[3], | |||
inputs[4], param, execution_policy, config) | |||
.node(); | |||
} | |||
return nullptr; | |||
} | |||
}; | |||
template <class MegDNNConv = megdnn::Convolution> | |||
struct MakeConvCallerEmpty { | |||
template <typename Opr> | |||
static VarNode* make(const cg::VarNodeArray&, | |||
const typename MegDNNConv::Param&, | |||
const megdnn::param::ExecutionPolicy&, | |||
const OperatorNodeConfig&) { | |||
return nullptr; | |||
} | |||
}; | |||
template <class Opr, class Maker0, class MegDNNConv, | |||
class Maker1 = MakeConvCallerEmpty<MegDNNConv>, | |||
class Maker2 = MakeConvCallerEmpty<MegDNNConv>, | |||
typename ConvParam = megdnn::param::Convolution> | |||
struct ConvMakerImpl { | |||
static VarNode* make(const cg::VarNodeArray& inputs, const ConvParam& param, | |||
const megdnn::param::ExecutionPolicy& execution_policy, | |||
const OperatorNodeConfig& config) { | |||
VarNode* ret = Maker0::template make<Opr>(inputs, param, | |||
execution_policy, config); | |||
if (!ret) { | |||
ret = Maker1::template make<Opr>(inputs, param, execution_policy, | |||
config); | |||
} | |||
if (!ret) { | |||
ret = Maker2::template make<Opr>(inputs, param, execution_policy, | |||
config); | |||
} | |||
mgb_assert(ret); | |||
return ret; | |||
} | |||
}; | |||
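// The callers above form a chain of responsibility over input arity: each
// MakeConvCallerN returns nullptr unless inputs.size() == N, and
// ConvMakerImpl tries Maker0, Maker1, Maker2 in turn, asserting if none of
// them accepts the input count. A hedged usage sketch (variable names are
// assumptions):
#if 0
// Rebuild a conv-bias with 2, 3 or 4 inputs (src, filter[, bias[, z]]):
VarNode* out = ConvMaker<opr::ConvBiasForward>::make(
        inputs, param, execution_policy, config);
#endif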
template <typename Opr>
struct ConvMaker;

template <>
struct ConvMaker<opr::Convolution>
        : public ConvMakerImpl<opr::Convolution,
                               MakeConvCaller2<megdnn::Convolution>,
                               megdnn::Convolution> {};

template <>
struct ConvMaker<opr::ConvolutionBackwardData>
        : public ConvMakerImpl<opr::ConvolutionBackwardData,
                               MakeConvCaller2<megdnn::Convolution>,
                               megdnn::Convolution,
                               MakeConvCaller3<megdnn::Convolution>> {};

template <>
struct ConvMaker<opr::ConvBiasForward>
        : public ConvMakerImpl<opr::ConvBiasForward,
                               MakeConvCaller2<megdnn::ConvBiasForward>,
                               megdnn::ConvBiasForward,
                               MakeConvCaller3<megdnn::ConvBiasForward>,
                               MakeConvCaller4<megdnn::ConvBiasForward>,
                               megdnn::param::ConvBias> {};

template <>
struct ConvMaker<opr::BatchConvBiasForward>
        : public ConvMakerImpl<opr::BatchConvBiasForward,
                               MakeConvCaller2<megdnn::BatchConvBiasForward>,
                               megdnn::BatchConvBiasForward,
                               MakeConvCaller3<megdnn::BatchConvBiasForward>,
                               MakeConvCaller4<megdnn::BatchConvBiasForward>,
                               megdnn::param::BatchConvBias> {};

#if 0
#include "../../opr/impl/internal/invoke.h"
template <typename Opr>
struct MultiAlgoOprTrait;

#define APPLY(statement, ...)                                  \
    mgb::apply([&](const auto&... args) { return statement; }, \
               std::tuple_cat(__VA_ARGS__))

#define INST(_Opr)                                                          \
    template <>                                                             \
    struct MultiAlgoOprTrait<_Opr> {                                        \
        static constexpr bool has_algo = true;                              \
        using MegDNNOpr = megdnn::_Opr;                                     \
        static constexpr int arity = OprArityTrait<MegDNNOpr>::arity;       \
        using FixedTensorLayouts = std::array<TensorLayout, arity>;         \
        static bool has_available_algo(const VarNodeArray& i,               \
                                       const cg::OperatorNodeBase* opr_) {  \
            MIDOUT_B(midout_iv(MGB_HASH_STR(#_Opr)),                        \
                     midout_iv(MGB_HASH_STR("has_available_algo")))         \
            auto&& opr = opr_->cast_final_safe<_Opr>();                     \
            auto&& megdnn_opr =                                             \
                    reinterpret_cast<MegDNNOpr*>(opr.megdnn_opr());         \
            FixedTensorLayouts array_layouts;                               \
            size_t in = i.size() - 1;                                       \
            for (size_t idx = 0; idx < in; idx++) {                         \
                const auto& v = i[idx];                                     \
                array_layouts[idx] =                                        \
                        TensorLayout{v->shape(), v->dtype(), v->format()};  \
            }                                                               \
            const auto& v = i[in];                                          \
            array_layouts[arity - 1] =                                      \
                    TensorLayout{v->shape(), v->dtype(), v->format()};      \
            return APPLY(::megdnn::has_available_algo(megdnn_opr, args...), \
                         array_layouts);                                    \
            MIDOUT_E                                                        \
        }                                                                   \
    };
INST(Convolution)
INST(ConvBiasForward)
INST(ConvolutionBackwardData)
INST(PoolingForward)
#undef APPLY
#undef INST
#endif
}  // namespace

namespace mgb {
namespace gopt {
namespace intl {

template <typename Opr>
struct OprFormatModifier;

#define INST(_Opr)                                                            \
    template <>                                                               \
    struct OprFormatModifier<_Opr> {                                          \
        using OprFormat = typename _Opr::Param::Format;                       \
        static VarNode* make(OprFormat opr_format, const VarNodeArray& i,     \
                             const cg::OperatorNodeBase* opr_) {              \
            MIDOUT_B(_Opr)                                                    \
            auto&& opr = opr_->cast_final_safe<_Opr>();                       \
            auto param = opr.param();                                         \
            param.format = opr_format;                                        \
            return ConvMaker<_Opr>::make(i, param, opr.execution_policy(),    \
                                         opr.config());                       \
            MIDOUT_E                                                          \
        }                                                                     \
    };
INST(Convolution);
INST(ConvBiasForward);
INST(ConvolutionBackwardData);
INST(BatchConvBiasForward);
#undef INST

template <>
struct OprFormatModifier<WarpPerspective> {
    using Opr = opr::WarpPerspective;
    using OprFormat = typename Opr::Param::Format;
    static VarNode* make(OprFormat opr_format, const VarNodeArray& i,
                         const cg::OperatorNodeBase* opr_) {
        MIDOUT_B(Opr)
        auto&& opr = opr_->cast_final_safe<Opr>();
        auto param = opr.param();
        param.format = opr_format;
        if (i.size() == 3) {
            return Opr::make(i[0], i[1], i[2], param, opr.config()).node();
        } else {
            mgb_assert(i.size() == 4);
            return Opr::make(i[0], i[1], i[2], i[3], param, opr.config())
                    .node();
        }
        MIDOUT_E
    }
};

#define INST(_Opr, _arity)                                                \
    template <>                                                           \
    struct OprFormatModifier<_Opr> {                                      \
        using OprFormat = typename _Opr::Param::Format;                   \
        static VarNode* make(OprFormat opr_format, const VarNodeArray& i, \
                             const cg::OperatorNodeBase* opr_) {          \
            MIDOUT_B(_Opr)                                                \
            auto&& opr = opr_->cast_final_safe<_Opr>();                   \
            auto param = opr.param();                                     \
            param.format = opr_format;                                    \
            return serialization::OprMaker<_Opr, _arity>::make(           \
                           param, i, *i[0]->owner_graph(), opr.config())  \
                    ->output(0);                                          \
            MIDOUT_E                                                      \
        }                                                                 \
    };
INST(PoolingForward, 1);
INST(Resize, 2);
#undef INST

VarNode* modify_opr_format(opr::ConvBias::Param::Format opr_format,
                           const VarNodeArray& i,
                           const cg::OperatorNodeBase* opr) {
#define cb(_Opr)                                                  \
    if (opr->dyn_typeinfo() == _Opr::typeinfo()) {                \
        return OprFormatModifier<_Opr>::make(opr_format, i, opr); \
    } else
    FOREACH_FORMAT_AWARE_OPR(cb) {
        mgb_throw(InternalError, "invalid format aware operator(got:%s)",
                  opr->dyn_typeinfo()->name);
    }
#undef cb
}
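// modify_opr_format rebuilds a format-aware operator with identical inputs
// and parameters except for Param::Format. Illustrative call (the
// surrounding variable names are assumptions):
#if 0
// re-create `opr` so that it computes in NCHW4; `new_inps` holds the
// already reformatted input var nodes
VarNode* y = mgb::gopt::intl::modify_opr_format(
        opr::ConvBias::Param::Format::NCHW4, new_inps, opr);
#endif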
#if 0
bool has_available_algo(const VarNodeArray& i,
                        const cg::OperatorNodeBase* opr) {
#define cb(_Opr)                                                    \
    if (opr->dyn_typeinfo() == _Opr::typeinfo()) {                  \
        MGB_MARK_USED_VAR(MultiAlgoOprTrait<_Opr>::has_algo);       \
        VarNodeArray _ = i;                                         \
        _.emplace_back(opr->output(0));                             \
        return MultiAlgoOprTrait<_Opr>::has_available_algo(_, opr); \
    } else
    cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData)
            cb(PoolingForward) {
        mgb_throw(InternalError, "invalid multi-algo operator(got:%s)",
                  opr->dyn_typeinfo()->name);
    }
}
#endif
}  // namespace intl
}  // namespace gopt
}  // namespace mgb

// vim: syntax=cpp.doxygen
@@ -0,0 +1,36 @@
/**
 * \file src/gopt/impl/opr_format_modifier.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#pragma once
#include "megbrain/graph.h"
#include "megbrain/opr/dnn/convolution.h"

namespace mgb {
namespace gopt {
namespace intl {

#define FOREACH_FORMAT_AWARE_OPR(cb)                                \
    cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData) \
            cb(PoolingForward) cb(WarpPerspective) cb(Resize)
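// For reference, FOREACH_FORMAT_AWARE_OPR(cb) expands to
//     cb(Convolution) cb(ConvBiasForward) cb(ConvolutionBackwardData)
//     cb(PoolingForward) cb(WarpPerspective) cb(Resize)
// so a caller such as modify_opr_format stamps out one if-branch per
// format-aware operator type.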
#if 0
bool has_available_algo(const VarNodeArray& i, const cg::OperatorNodeBase* opr);
#endif

VarNode* modify_opr_format(opr::ConvBias::Param::Format opr_format,
                           const VarNodeArray& i,
                           const cg::OperatorNodeBase* opr);

}  // namespace intl
}  // namespace gopt
}  // namespace mgb

// vim: syntax=cpp.doxygen
@@ -0,0 +1,582 @@
/**
 * \file src/gopt/impl/opr_tensor_formats_config.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "./utils.h" | |||
#include "megbrain/gopt/global_layout_transform.h" | |||
#include "megbrain/opr/dnn/pooling.h" | |||
#include "megbrain/opr/imgproc.h" | |||
#include "midout.h" | |||
MIDOUT_DECL(megbrain_opr_tensor_formats_config) | |||
#define MIDOUT_B(...) \ | |||
MIDOUT_BEGIN(megbrain_opr_tensor_formats_config, __VA_ARGS__) { | |||
#define MIDOUT_E \ | |||
} \ | |||
MIDOUT_END(); | |||
using namespace mgb; | |||
using namespace cg; | |||
using namespace gopt; | |||
using OprFormat = opr::ConvBias::Param::Format; | |||
namespace { | |||
template <typename Opr> | |||
struct ConvParamTrait; | |||
#define INST(_conv, _weight_idx, _bias_idx, _has_bias) \ | |||
template <> \ | |||
struct ConvParamTrait<opr::_conv> { \ | |||
static constexpr int weight_idx = _weight_idx; \ | |||
static constexpr int bias_idx = _bias_idx; \ | |||
static constexpr bool has_bias = _has_bias; \ | |||
} | |||
INST(ConvBias, 1, 2, true); | |||
INST(ConvolutionForward, 1, 0, false); | |||
INST(ConvolutionBackwardData, 0, 0, false); | |||
template <typename Opr, size_t weight_idx = ConvParamTrait<Opr>::weight_idx> | |||
static bool is_channel_wise_conv(const OperatorNodeBase* opr) { | |||
MGB_MARK_USED_VAR(ConvParamTrait<Opr>::has_bias); | |||
MGB_MARK_USED_VAR(ConvParamTrait<Opr>::bias_idx); | |||
auto&& conv = opr->cast_final_safe<Opr>(); | |||
auto format = conv.param().format; | |||
auto weight = opr->input(weight_idx); | |||
auto weight_shp = weight->shape(); | |||
if (conv.param().sparse == Opr::Param::Sparse::DENSE) | |||
return false; | |||
size_t ocpg, icpg; | |||
if (format == Opr::Param::Format::NCHW) { | |||
ocpg = weight_shp[1], icpg = weight_shp[2]; | |||
return ocpg == 1 && icpg == 1; | |||
} | |||
return false; | |||
} | |||
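// For a grouped convolution in NCHW, the weight tensor is laid out as
// (G, OCPG, ICPG, FH, FW); channel-wise (depthwise) convolution is the
// special case OCPG == ICPG == 1, which is what the predicate above checks
// via weight_shp[1] and weight_shp[2]. Illustrative shapes (values are
// assumptions):
#if 0
TensorShape channel_wise_weight{32, 1, 1, 3, 3};    // 32 groups, ocpg==icpg==1
TensorShape ordinary_group_weight{4, 16, 8, 3, 3};  // not channel-wise
#endif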
template <OprFormat opr_format_>
struct OprSingleInOutTensorFormatsDispatcherImpl;

template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NCHW> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NCHW;
        config.input_dtypes = {opr->input(0)->dtype().enumv()};
        config.input_tensor_types = {TensorType::FEATURE};
        config.output_dtypes = {opr->output(0)->dtype().enumv()};
        config.input_tensor_formats = {TensorFormats::NCHW};
        config.output_tensor_formats = {TensorFormats::NCHW};
        return config;
    }
};

template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NCHW4> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NCHW4;
        bool available = true;
        available &= opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
        config.input_dtypes = {opr->input(0)->dtype().enumv()};
        config.input_tensor_types = {TensorType::FEATURE};
        available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
        config.output_dtypes = {opr->output(0)->dtype().enumv()};
        config.input_tensor_formats = {TensorFormats::NCHWc4};
        config.output_tensor_formats = {TensorFormats::NCHWc4};
        if (available)
            return config;
        return None;
    }
};

template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::CHWN4> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::CHWN4;
        bool available = true;
        available &= opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
        config.input_dtypes = {opr->input(0)->dtype().enumv()};
        config.input_tensor_types = {TensorType::FEATURE};
        available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
        config.output_dtypes = {opr->output(0)->dtype().enumv()};
        config.input_tensor_formats = {TensorFormats::CHWNc4};
        config.output_tensor_formats = {TensorFormats::CHWNc4};
        if (available)
            return config;
        return None;
    }
};

template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NCHW32> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NCHW32;
        bool available = true;
        available &= opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
        config.input_dtypes = {opr->input(0)->dtype().enumv()};
        config.input_tensor_types = {TensorType::FEATURE};
        available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
        config.output_dtypes = {opr->output(0)->dtype().enumv()};
        config.input_tensor_formats = {TensorFormats::NCHWc32};
        config.output_tensor_formats = {TensorFormats::NCHWc32};
        if (available)
            return config;
        return None;
    }
};

template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NHWC> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NHWC;
        bool available = true;
        available &=
                opr->input(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm ||
                opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS4;
        config.input_dtypes = {opr->input(0)->dtype().enumv()};
        config.input_tensor_types = {TensorType::FEATURE};
        available &= opr->output(0)->dtype().enumv() ==
                     opr->input(0)->dtype().enumv();
        config.output_dtypes = {opr->output(0)->dtype().enumv()};
        config.input_tensor_formats = {TensorFormats::NHWC};
        config.output_tensor_formats = {TensorFormats::NHWC};
        if (available)
            return config;
        return None;
    }
};

template <>
struct OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NCHW64> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NCHW64;
        bool available = true;
        available &=
                opr->input(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm ||
                opr->input(0)->dtype().enumv() == DTypeEnum::QuantizedS4;
        config.input_dtypes = {opr->input(0)->dtype().enumv()};
        config.input_tensor_types = {TensorType::FEATURE};
        available &= opr->output(0)->dtype().enumv() ==
                     opr->input(0)->dtype().enumv();
        config.output_dtypes = {opr->output(0)->dtype().enumv()};
        config.input_tensor_formats = {TensorFormats::NCHWc64};
        config.output_tensor_formats = {TensorFormats::NCHWc64};
        if (available)
            return config;
        return None;
    }
};
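// Each dispatcher either returns a fully populated configuration or None
// when the operator's dtypes rule the format out, so callers can probe a
// format without special-casing operator kinds. A hedged usage sketch
// (use_config is a hypothetical consumer):
#if 0
auto maybe_config =
        OprSingleInOutTensorFormatsDispatcherImpl<OprFormat::NCHW4>::dispatch(
                opr);
if (maybe_config.valid()) {
    // NCHW4 is admissible: the opr consumes and produces QuantizedS8
    use_config(maybe_config.val());
}
#endif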
template <typename Opr, OprFormat opr_format_>
struct ConvTensorFormatsDispatcherImpl;

template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NCHW> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NCHW;
        // setup dtypes
        for (size_t i = 0; i < opr->input().size(); ++i) {
            config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
            TensorType tensor_type =
                    i == 1 ? TensorType::WEIGHT : TensorType::FEATURE;
            config.input_tensor_types.emplace_back(tensor_type);
        }
        config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
        // setup tensor formats
        if (conv.param().sparse == Opr::Param::Sparse::DENSE) {
            config.input_tensor_formats = {
                    TensorFormats::NCHW, TensorFormats::NCHW,
                    TensorFormats::NCHW, TensorFormats::NCHW};
        } else {
            mgb_assert(conv.param().sparse == Opr::Param::Sparse::GROUP);
            if (is_channel_wise_conv<Opr>(opr)) {
                config.input_tensor_formats = {
                        TensorFormats::NCHW, TensorFormats::C11RS,
                        TensorFormats::NCHW, TensorFormats::NCHW};
            } else {
                config.input_tensor_formats = {
                        TensorFormats::NCHW, TensorFormats::GKCRS,
                        TensorFormats::NCHW, TensorFormats::NCHW};
            }
        }
        config.output_tensor_formats = {TensorFormats::NCHW};
        return config;
    }
};

template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NHWC> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NHWC;
        bool available = true;
        for (size_t i = 0; i < opr->input().size(); ++i) {
            if (i == 2)
                available &= opr->input(i)->dtype().enumv() ==
                             DTypeEnum::QuantizedS32;
            else
                available &= opr->input(i)->dtype().enumv() ==
                                     DTypeEnum::Quantized4Asymm ||
                             opr->input(i)->dtype().enumv() ==
                                     DTypeEnum::QuantizedS4;
            config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
            TensorType tensor_type =
                    i == 1 ? TensorType::WEIGHT : TensorType::FEATURE;
            config.input_tensor_types.emplace_back(tensor_type);
        }
        available &=
                opr->output(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm ||
                opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS4;
        config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
        available &= conv.param().sparse == Opr::Param::Sparse::DENSE;
        config.input_tensor_formats = {TensorFormats::NHWC, TensorFormats::NHWC,
                                       TensorFormats::NHWC,
                                       TensorFormats::NHWC};
        config.output_tensor_formats = {TensorFormats::NHWC};
        if (available)
            return config;
        return None;
    }
};

template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NCHW4> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NCHW4;
        bool available = true;
        // setup dtypes
        for (size_t i = 0; i < opr->input().size(); ++i) {
            if (i == 2)
                available &= opr->input(i)->dtype().enumv() ==
                             DTypeEnum::QuantizedS32;
            else
                available &= opr->input(i)->dtype().enumv() ==
                             DTypeEnum::QuantizedS8;
            config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
            TensorType tensor_type =
                    i == 1 ? TensorType::WEIGHT : TensorType::FEATURE;
            config.input_tensor_types.emplace_back(tensor_type);
        }
        available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
        config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
        // setup tensor formats
        if (conv.param().sparse == Opr::Param::Sparse::DENSE) {
            config.input_tensor_formats = {
                    TensorFormats::NCHWc4, TensorFormats::NCHWc4,
                    TensorFormats::NCHWc4, TensorFormats::NCHWc4};
        } else {
            mgb_assert(conv.param().sparse == Opr::Param::Sparse::GROUP);
            if (is_channel_wise_conv<Opr>(opr)) {
                config.input_tensor_formats = {
                        TensorFormats::NCHWc4, TensorFormats::C11RSc4,
                        TensorFormats::NCHWc4, TensorFormats::NCHWc4};
            } else {
                config.input_tensor_formats = {
                        TensorFormats::NCHWc4, TensorFormats::GKCRSc4,
                        TensorFormats::NCHWc4, TensorFormats::NCHWc4};
            }
        }
        config.output_tensor_formats = {TensorFormats::NCHWc4};
        if (available)
            return config;
        return None;
    }
};
template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NCHW32> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NCHW32;
        bool available = true;
        for (size_t i = 0; i < opr->input().size(); ++i) {
            if (i == 2)
                available &= opr->input(i)->dtype().enumv() ==
                             DTypeEnum::QuantizedS32;
            else
                available &= opr->input(i)->dtype().enumv() ==
                             DTypeEnum::QuantizedS8;
            config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
            TensorType tensor_type =
                    i == 1 ? TensorType::WEIGHT : TensorType::FEATURE;
            config.input_tensor_types.emplace_back(tensor_type);
        }
        available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
        config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
        available &= conv.param().sparse == Opr::Param::Sparse::DENSE;
        config.input_tensor_formats = {
                TensorFormats::NCHWc32, TensorFormats::NCHWc32,
                TensorFormats::NCHWc32, TensorFormats::NCHWc32};
        config.output_tensor_formats = {TensorFormats::NCHWc32};
        if (available)
            return config;
        return None;
    }
};

template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::NCHW64> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NCHW64;
        bool available = true;
        for (size_t i = 0; i < opr->input().size(); ++i) {
            if (i == 2)
                available &= opr->input(i)->dtype().enumv() ==
                             DTypeEnum::QuantizedS32;
            else
                available &= opr->input(i)->dtype().enumv() ==
                                     DTypeEnum::Quantized4Asymm ||
                             opr->input(i)->dtype().enumv() ==
                                     DTypeEnum::QuantizedS4;
            config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
            TensorType tensor_type =
                    i == 1 ? TensorType::WEIGHT : TensorType::FEATURE;
            config.input_tensor_types.emplace_back(tensor_type);
        }
        available &=
                opr->output(0)->dtype().enumv() == DTypeEnum::Quantized4Asymm ||
                opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS4;
        config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
        available &= conv.param().sparse == Opr::Param::Sparse::DENSE;
        config.input_tensor_formats = {
                TensorFormats::NCHWc64, TensorFormats::NCHWc64,
                TensorFormats::NCHWc64, TensorFormats::NCHWc64};
        config.output_tensor_formats = {TensorFormats::NCHWc64};
        if (available)
            return config;
        return None;
    }
};

template <typename Opr>
struct ConvTensorFormatsDispatcherImpl<Opr, OprFormat::CHWN4> {
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::CHWN4;
        bool available = true;
        for (size_t i = 0; i < opr->input().size(); ++i) {
            if (i == 2)
                available &= opr->input(i)->dtype().enumv() ==
                             DTypeEnum::QuantizedS32;
            else
                available &= opr->input(i)->dtype().enumv() ==
                             DTypeEnum::QuantizedS8;
            config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
            TensorType tensor_type =
                    i == 1 ? TensorType::WEIGHT : TensorType::FEATURE;
            config.input_tensor_types.emplace_back(tensor_type);
        }
        available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
        config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
        available &= conv.param().sparse == Opr::Param::Sparse::DENSE;
        config.input_tensor_formats = {
                TensorFormats::CHWNc4, TensorFormats::CHWNc4,
                TensorFormats::CHWNc4, TensorFormats::CHWNc4};
        config.output_tensor_formats = {TensorFormats::CHWNc4};
        if (available)
            return config;
        return None;
    }
};
template <>
struct ConvTensorFormatsDispatcherImpl<opr::ConvolutionBackwardData,
                                       OprFormat::NCHW> {
    using Opr = opr::ConvolutionBackwardData;
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NCHW;
        // setup dtypes
        for (size_t i = 0; i < opr->input().size(); ++i) {
            config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
            TensorType tensor_type =
                    i == 0 ? TensorType::WEIGHT : TensorType::FEATURE;
            config.input_tensor_types.emplace_back(tensor_type);
        }
        config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
        // setup tensor formats
        if (conv.param().sparse == Opr::Param::Sparse::DENSE) {
            config.input_tensor_formats = {
                    TensorFormats::NCHW, TensorFormats::NCHW,
                    TensorFormats::NCHW, TensorFormats::NCHW};
        } else {
            mgb_assert(conv.param().sparse == Opr::Param::Sparse::GROUP);
            if (is_channel_wise_conv<Opr>(opr)) {
                config.input_tensor_formats = {
                        TensorFormats::C11RS, TensorFormats::NCHW,
                        TensorFormats::NCHW, TensorFormats::NCHW};
            } else {
                config.input_tensor_formats = {
                        TensorFormats::GKCRS, TensorFormats::NCHW,
                        TensorFormats::NCHW, TensorFormats::NCHW};
            }
        }
        config.output_tensor_formats = {TensorFormats::NCHW};
        return config;
    }
};

template <>
struct ConvTensorFormatsDispatcherImpl<opr::ConvolutionBackwardData,
                                       OprFormat::NCHW4> {
    using Opr = opr::ConvolutionBackwardData;
    static Maybe<OprTensorFormatsConfiguration> dispatch(
            const OperatorNodeBase* opr) {
        const auto& conv = opr->cast_final_safe<Opr>();
        OprTensorFormatsConfiguration config;
        config.typeinfo = opr->dyn_typeinfo();
        config.opr_format = OprFormat::NCHW4;
        bool available = true;
        for (size_t i = 0; i < opr->input().size(); ++i) {
            available &=
                    opr->input(i)->dtype().enumv() == DTypeEnum::QuantizedS8;
            config.input_dtypes.emplace_back(opr->input(i)->dtype().enumv());
            TensorType tensor_type =
                    i == 0 ? TensorType::WEIGHT : TensorType::FEATURE;
            config.input_tensor_types.emplace_back(tensor_type);
        }
        available &= opr->output(0)->dtype().enumv() == DTypeEnum::QuantizedS8;
        config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv());
        available &= conv.param().sparse == opr::ConvBias::Param::Sparse::DENSE;
        config.input_tensor_formats = {
                TensorFormats::NCHWc4, TensorFormats::NCHWc4,
                TensorFormats::NCHWc4, TensorFormats::NCHWc4};
        config.output_tensor_formats = {TensorFormats::NCHWc4};
        if (available)
            return config;
        return None;
    }
};
struct StaticData {
    struct KeyHash {
        size_t operator()(const std::pair<Typeinfo*, OprFormat>& val) const {
            size_t h1 = mgb::hash<Typeinfo*>(val.first);
            size_t h2 =
                    std::hash<uint32_t>()(static_cast<uint32_t>(val.second));
            return mgb::hash_pair_combine(h1, h2);
        }
    };
    using OprTensorFormatsDispatcher =
            OprTensorFormatsConfiguration::OprTensorFormatsDispatcher;
    std::unordered_map<std::pair<Typeinfo*, OprFormat>,
                       OprTensorFormatsDispatcher, KeyHash>
            typefmt2dispatcher;
    StaticData();
};

StaticData::StaticData() {
#define OPR_TENSOR_FORMATS_CONFIG_REG(_Opr, _fmt)                  \
    typefmt2dispatcher[{opr::_Opr::typeinfo(), OprFormat::_fmt}] = \
            [](const OperatorNodeBase* opr) {                      \
                MIDOUT_B(opr::_Opr, midout_iv(OprFormat::_fmt))    \
                return ConvTensorFormatsDispatcherImpl<            \
                        opr::_Opr, OprFormat::_fmt>::dispatch(opr); \
                MIDOUT_E                                           \
            }

#define OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(_Opr, _fmt)    \
    typefmt2dispatcher[{opr::_Opr::typeinfo(), OprFormat::_fmt}] = \
            [](const OperatorNodeBase* opr) {                      \
                MIDOUT_B(opr::_Opr, midout_iv(OprFormat::_fmt))    \
                return OprSingleInOutTensorFormatsDispatcherImpl<  \
                        OprFormat::_fmt>::dispatch(opr);           \
                MIDOUT_E                                           \
            }

    OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW);
    OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NHWC);
    OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW4);
    OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, CHWN4);
    OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW32);
    OPR_TENSOR_FORMATS_CONFIG_REG(ConvBias, NCHW64);
    OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionForward, NCHW);
    OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionForward, NCHW4);
    OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionBackwardData, NCHW);
    OPR_TENSOR_FORMATS_CONFIG_REG(ConvolutionBackwardData, NCHW4);
    OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NCHW);
    OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NHWC);
    OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NCHW4);
    OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(WarpPerspectiveForward, NCHW64);
    OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW);
    OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NHWC);
    OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW4);
    OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, CHWN4);
    OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW32);
    OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG(PoolingForward, NCHW64);
#undef OPR_TENSOR_FORMATS_CONFIG_REG
#undef OPR_SINGLE_IN_OUT_TENSOR_FORMATS_CONFIG_REG
}

StaticData& static_data() {
    static StaticData inst;
    return inst;
}
}  // namespace

OprTensorFormatsConfiguration::OprTensorFormatsDispatcher*
OprTensorFormatsConfiguration::find_dispatcher_by_type_format(
        Typeinfo* type, OprFormat opr_format) {
    auto&& typefmt2dispatcher = static_data().typefmt2dispatcher;
    auto iter = typefmt2dispatcher.find(std::make_pair(type, opr_format));
    mgb_assert(iter != typefmt2dispatcher.end(),
               "cannot find OprTensorFormatsDispatcher for opr type(%s) and "
               "opr format(%s)",
               type->name, opr_format_to_string(opr_format));
    return &iter->second;
}
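// The registry is keyed on (operator typeinfo, OprFormat) and built once in
// StaticData; the returned pointer refers to a callable dispatcher, as used
// by the profiler. Sketch of a lookup:
#if 0
auto* dispatcher =
        OprTensorFormatsConfiguration::find_dispatcher_by_type_format(
                opr::ConvBias::typeinfo(), OprFormat::NCHW4);
auto maybe_config = (*dispatcher)(opr);  // None if NCHW4 is unavailable
#endif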
// vim: syntax=cpp.doxygen
@@ -0,0 +1,527 @@
/**
 * \file src/gopt/impl/profiler_impl.cpp
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2021 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
 * or implied.
 */
#include "./opr_format_modifier.h" | |||
#include "./utils.h" | |||
#include "megbrain/gopt/framework.h" | |||
#include "megbrain/gopt/global_layout_transform.h" | |||
#include "megbrain/graph/event.h" | |||
#include "megbrain/opr/dnn/pooling.h" | |||
#include "megbrain/opr/imgproc.h" | |||
#include "megbrain/opr/io.h" | |||
#include "megbrain/plugin/base.h" | |||
#include "megbrain/serialization/sereg.h" | |||
using namespace mgb; | |||
using namespace cg; | |||
using namespace opr; | |||
using namespace gopt; | |||
using ReformatKey = ReformatManager::ReformatKey; | |||
namespace { | |||
using OprFormat = Problem::OprFormat; | |||
OprFormat tensor_formats_to_opr_format(TensorFormats tensor_format) { | |||
switch (tensor_format) { | |||
case TensorFormats::NCHW: | |||
return OprFormat::NCHW; | |||
case TensorFormats::NCHWc4: | |||
return OprFormat::NCHW4; | |||
case TensorFormats::NCHWc8: | |||
return OprFormat::NCHW8; | |||
case TensorFormats::NCHWc32: | |||
return OprFormat::NCHW32; | |||
case TensorFormats::NCHWc64: | |||
return OprFormat::NCHW64; | |||
case TensorFormats::NHWC: | |||
return OprFormat::NHWC; | |||
case TensorFormats::CHWNc4: | |||
return OprFormat::CHWN4; | |||
default: | |||
mgb_throw(MegBrainError, "tensor format(%u) is not supported", | |||
static_cast<uint32_t>(tensor_format)); | |||
} | |||
} | |||
class GraphPartitionProfiler final : public PluginBase { | |||
using CompNodeEventPtr = std::unique_ptr<CompNode::Event>; | |||
public: | |||
using OprFilter = thin_function<bool(OperatorNodeBase*)>; | |||
struct OprKernEvent { | |||
CompNodeEventPtr start, end; | |||
}; | |||
GraphPartitionProfiler(ComputingGraph* graph, OprFilter opr_filter); | |||
~GraphPartitionProfiler() noexcept; | |||
float duration_in_usec() const; | |||
private: | |||
void record_event(CompNodeEventPtr& dest, CompNode cn) { | |||
if (dest == nullptr) | |||
dest = cn.create_event(CompNode::Event::NEED_TIMER); | |||
dest->record(); | |||
} | |||
ThinHashMap<OperatorNodeBase*, OprKernEvent> m_kern_event; | |||
OprFilter m_opr_filter; | |||
}; | |||
GraphPartitionProfiler::GraphPartitionProfiler(ComputingGraph* graph, | |||
OprFilter opr_filter) | |||
: PluginBase(graph), m_opr_filter(opr_filter) { | |||
using namespace event; | |||
auto on_before_kern = [this](BeforeKernel const& event) { | |||
if (!m_opr_filter(event.opr)) | |||
return; | |||
auto evptr = &m_kern_event[event.opr].start; | |||
record_event(*evptr, event.comp_node); | |||
}; | |||
auto on_after_kern = [this](AfterKernel const& event) { | |||
if (!m_opr_filter(event.opr)) | |||
return; | |||
auto evptr = &m_kern_event[event.opr].end; | |||
record_event(*evptr, event.comp_node); | |||
}; | |||
auto&& ev = graph->event(); | |||
add_event_handler(ev.register_receiver<BeforeKernel>(on_before_kern)); | |||
add_event_handler(ev.register_receiver<AfterKernel>(on_after_kern)); | |||
} | |||
GraphPartitionProfiler::~GraphPartitionProfiler() noexcept { | |||
auto wait = [](const CompNodeEventPtr& ev) { | |||
if (ev) | |||
ev->host_wait(); | |||
}; | |||
for (auto&& i : m_kern_event) { | |||
wait(i.second.start); | |||
wait(i.second.end); | |||
} | |||
} | |||
float GraphPartitionProfiler::duration_in_usec() const { | |||
float device_duration = 0.f; | |||
for (auto&& kern_ev : m_kern_event) { | |||
auto&& event = kern_ev.second; | |||
event.end->host_wait(); | |||
device_duration += 1e6 * event.start->elapsed_time_until(*event.end); | |||
} | |||
return device_duration; | |||
} | |||
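// GraphPartitionProfiler brackets every kernel launch of the filtered
// operators with a pair of CompNode events and sums the start->end elapsed
// times, so the reported duration is device time rather than host wall-clock
// time. Sketch of the pattern the profile_* methods below use (variable
// names are assumptions):
#if 0
auto profiler = std::make_unique<GraphPartitionProfiler>(
        graph.get(), [target](OperatorNodeBase* o) { return o == target; });
for (int i = 0; i < runs; ++i)
    func->execute();  // events are re-recorded on every run
float usec = profiler->duration_in_usec();  // device time of profiled kernels
#endif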
/*!
 * \brief An operator that indicates its input var node is contiguous
 */
// clang-format off
MGB_DEFINE_OPR_CLASS(MarkInputContiguous, SingleCNOperatorNodeBase) //{
    void scn_do_execute() override {}
    void init_output_static_infer_desc() override;
    void add_input_layout_constraint() override {
        input(0)->add_layout_constraint_contiguous();
    }

public:
    MarkInputContiguous(VarNode* input, const OperatorNodeConfig& config);
    static SymbolVar make(SymbolVar input, const OperatorNodeConfig& config = {});
};
// clang-format on

MGB_DYN_TYPE_OBJ_FINAL_IMPL(MarkInputContiguous);

MarkInputContiguous::MarkInputContiguous(VarNode* input,
                                         const OperatorNodeConfig& config)
        : Super(input->owner_graph(), config, "mark_contiguous", {input}) {
    add_input({input});
    add_output(None);
}

SymbolVar MarkInputContiguous::make(SymbolVar input,
                                    const OperatorNodeConfig& config) {
    return input.insert_single_output_opr<MarkInputContiguous>(input.node(),
                                                               config);
}

void MarkInputContiguous::init_output_static_infer_desc() {
    using namespace cg::static_infer;
    auto&& mgr = owner_graph()->static_infer_manager();
    mgr.register_shape_infer(output(0),
                             ShapeInferDesc::make_identity(input(0)));
}
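// MarkInputContiguous exists only so the profiled var has a consumer that
// forces a contiguous layout: any relayout implied by the format under test
// is thereby included in the measured graph. Minimal use, as in the
// profilers below:
#if 0
auto mark = MarkInputContiguous::make(SymbolVar(y));
auto func = graph->compile({{mark, {}}});
#endif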
}  // namespace

/* ================== ProfilerImpl =================*/
class ProfilerImpl final : public ProfilerBase {
public:
    ProfilerImpl(int runs = 10) : m_runs{runs} {}
    ~ProfilerImpl() = default;
    ProfilingResult profile(const Problem& problem) const override;

private:
    static constexpr float PROFILE_TIME_OUT = 1e7;
    /*!
     * \brief profile operators that are agnostic to opr format (e.g.
     * elemwise, elemwise multi type, typecvt)
     *
     * \param opr pointer to the operator node to be profiled
     * \param base_format the original tensor format of the operator node
     * \param available_tensor_formats the available tensor formats
     * \return the operator node record
     */
    OperatorNodeRecord profile_operator(
            const OperatorNodeBase* opr, TensorFormats base_format,
            const SmallVector<TensorFormats>& available_tensor_formats) const;
    float profile_operator(const OperatorNodeBase* opr,
                           TensorFormats base_format,
                           TensorFormats tensor_format) const;
    /*!
     * \brief profile operators that are aware of opr format (e.g. conv,
     * deconv, conv_bias)
     *
     * \param opr pointer to the operator node to be profiled
     * \param base_config the tensor formats configuration of the base opr
     * format
     * \param available_configs all the available configurations
     * \return the operator node record
     */
    OperatorNodeRecord profile_operator(
            const OperatorNodeBase* opr,
            const OprTensorFormatsConfiguration& base_config,
            const SmallVector<OprTensorFormatsConfiguration>& available_configs)
            const;
    float profile_operator(const OperatorNodeBase* opr,
                           const OprTensorFormatsConfiguration& base_config,
                           const OprTensorFormatsConfiguration& config) const;
    /*!
     * \brief profile the layout transform of a var node
     *
     * \param var pointer to the var node to be profiled
     * \param base_format the original tensor format in which the var node is
     * stored
     * \param available_tensor_formats the available tensor formats
     * \param extra_attribute the extra attributes (options) of the problem
     * \return the var node record
     */
    VarNodeRecord profile_var_node(
            const VarNode* var, TensorFormats base_format,
            const SmallVector<TensorFormats>& available_tensor_formats,
            ReformatKey::Attribute extra_attribute =
                    ReformatKey::Attribute::DEFAULT) const;
    float profile_var_node(const VarNode* var, TensorFormats base_format,
                           const ReformatKey& key) const;
    int m_runs;  //!< number of times each candidate graph is executed
};
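// ProfilerImpl measures two kinds of costs for the layout-transform problem:
// per-operator compute time under each candidate opr format, and per-var
// reformat (layout conversion) time between tensor formats. Entry point, as
// wired up by make_profiler() at the end of this file:
#if 0
auto profiler = ProfilerBase::make_profiler();  // returns a ProfilerImpl
auto result = profiler->profile(problem);  // fills opr_record and var_record
#endif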
ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator(
        const OperatorNodeBase* opr, TensorFormats base_format,
        const SmallVector<TensorFormats>& available_tensor_formats) const {
    OperatorNodeRecord record;
    record.opr = opr;
    auto& costs = record.costs;
    for (auto&& f : available_tensor_formats) {
        auto opr_format = tensor_formats_to_opr_format(f);
        costs[opr_format] = profile_operator(opr, base_format, f);
    }
    return record;
}

float ProfilerImpl::profile_operator(const OperatorNodeBase* opr,
                                     TensorFormats base_format,
                                     TensorFormats tensor_format) const {
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    graph->options().var_sanity_check_first_run = false;
    VarNodeArray new_inps(opr->input().size());
    for (size_t i = 0; i < opr->input().size(); ++i) {
        auto&& var = opr->input(i);
        auto&& cn = var->comp_node();
        auto&& dtype = var->dtype();
        auto dval = std::make_shared<DeviceTensorND>(cn, dtype);
        auto aligned_tensor_shape =
                make_aligned_tensor_shape(var, base_format, tensor_format);
        dval->resize(aligned_tensor_shape);
        auto aligned_var = opr::VolatileSharedDeviceTensor::make(*graph, dval);
        new_inps[i] = aligned_var.node();
    }
    auto new_opr = serialization::copy_opr_shallow(
            *opr, new_inps, opr->config(), {graph.get()});
    auto y = new_opr->output(0);
    auto mark = MarkInputContiguous::make(SymbolVar(y));
    auto func = graph->compile({{mark, {}}});
    auto filter = [new_opr](OperatorNodeBase* opr) { return opr == new_opr; };
    auto profiler = std::make_unique<GraphPartitionProfiler>(
            graph.get(), std::move(filter));
    for (int i = 0; i < m_runs; ++i)
        func->execute();
    return profiler->duration_in_usec();
}
ProfilerImpl::OperatorNodeRecord ProfilerImpl::profile_operator(
        const OperatorNodeBase* opr,
        const OprTensorFormatsConfiguration& base_config,
        const SmallVector<OprTensorFormatsConfiguration>& available_configs)
        const {
    OperatorNodeRecord record;
    record.opr = opr;
    auto& costs = record.costs;
    for (auto&& i : available_configs) {
        costs[i.opr_format] = profile_operator(opr, base_config, i);
    }
    return record;
}

float ProfilerImpl::profile_operator(
        const OperatorNodeBase* opr,
        const OprTensorFormatsConfiguration& base_config,
        const OprTensorFormatsConfiguration& config) const {
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    graph->options().var_sanity_check_first_run = false;
    VarNodeArray new_inps(opr->input().size());
    size_t i = 0;
    size_t nr_input_tensor =
            std::min(config.input_tensor_formats.size(), opr->input().size());
    for (; i < nr_input_tensor; ++i) {
        auto&& var = opr->input(i);
        auto&& cn = var->comp_node();
        auto&& dtype = var->dtype();
        auto dval = std::make_shared<DeviceTensorND>(cn, dtype);
        TensorShape aligned_shape;
        if (config.input_tensor_types[i] == TensorType::WEIGHT) {
            mgb_assert(base_config.input_tensor_types[i] == TensorType::WEIGHT);
            aligned_shape = make_aligned_weight_shape(
                    var, base_config.input_tensor_formats[i],
                    config.input_tensor_formats[i],
                    config.output_tensor_formats[0]);
        } else {
            mgb_assert(base_config.input_tensor_types[i] ==
                       config.input_tensor_types[i]);
            mgb_assert(base_config.input_tensor_types[i] ==
                       TensorType::FEATURE);
            aligned_shape = make_aligned_tensor_shape(
                    var, base_config.input_tensor_formats[i],
                    config.input_tensor_formats[i]);
        }
        dval->resize(aligned_shape);
        auto aligned_var = opr::VolatileSharedDeviceTensor::make(*graph, dval);
        new_inps[i] = aligned_var.node();
    }
    for (; i < opr->input().size(); ++i) {
        auto&& var = opr->input(i);
        auto&& cn = var->comp_node();
        auto&& dtype = var->dtype();
        auto hval = std::make_shared<HostTensorND>(cn, dtype);
        hval->resize(var->shape());
        auto cb = [&](DeviceTensorND& d) { hval->copy_from(d).sync(); };
        {
            auto cg = var->owner_graph();
            cg->compile({{var, cb}})->execute();
        }
        auto imm = opr::ImmutableTensor::make(*graph, *hval);
        new_inps[i] = imm.node();
    }
    VarNode* y = mgb::gopt::intl::modify_opr_format(config.opr_format, new_inps,
                                                    opr);
#if 0
    static const ThinHashSet<Typeinfo*> multi_algo_oprs = {
            opr::Convolution::typeinfo(),
            opr::ConvBiasForward::typeinfo(),
            opr::ConvolutionBackwardData::typeinfo(),
            opr::PoolingForward::typeinfo(),
    };
    if (multi_algo_oprs.count(opr->dyn_typeinfo()) &&
        !mgb::gopt::intl::has_available_algo(new_inps, y->owner_opr()))
        return PROFILE_TIME_OUT;
#endif
    auto mark = MarkInputContiguous::make(SymbolVar(y));
    auto func = graph->compile({{mark, {}}});
    auto new_opr = y->owner_opr();
    auto filter = [&new_opr](OperatorNodeBase* opr) { return opr == new_opr; };
    auto profiler = std::make_unique<GraphPartitionProfiler>(
            graph.get(), std::move(filter));
    for (int i = 0; i < m_runs; ++i)
        func->execute();
    return profiler->duration_in_usec();
}
ProfilerImpl::VarNodeRecord ProfilerImpl::profile_var_node(
        const VarNode* var, TensorFormats base_format,
        const SmallVector<TensorFormats>& available_tensor_formats,
        ReformatKey::Attribute attribute) const {
    VarNodeRecord record;
    record.var = var;
    auto& costs = record.costs;
    for (auto&& i : available_tensor_formats) {
        for (auto&& o : available_tensor_formats) {
            if (i == o)
                continue;
            ReformatKey key{i, o, attribute, var->dtype().enumv(),
                            var->dtype().enumv()};
            costs[{i, o}] = profile_var_node(var, base_format, key);
        }
    }
    return record;
}

float ProfilerImpl::profile_var_node(const VarNode* var,
                                     TensorFormats base_format,
                                     const ReformatKey& key) const {
    auto&& cn = var->comp_node();
    auto&& dtype = var->dtype();
    auto dval = std::make_shared<DeviceTensorND>(cn, dtype);
    auto aligned_tensor_shape =
            make_aligned_tensor_shape(var, base_format, key.input_format);
    dval->resize(aligned_tensor_shape);
    auto graph = ComputingGraph::make();
    graph->options().graph_opt_level = 0;
    graph->options().var_sanity_check_first_run = false;
    auto aligned_var = opr::VolatileSharedDeviceTensor::make(*graph, dval);
    auto builder = ReformatManager::instance().auto_aligned_reformat_featrue(
            var, base_format, key);
    auto y = builder({aligned_var.node()});
    ThinHashSet<OperatorNodeBase*> set;
    DepOprIter iter([&set](OperatorNodeBase* opr) { set.insert(opr); });
    iter.add(y->owner_opr());
    iter.set_visited(aligned_var.node()->owner_opr());
    auto mark = MarkInputContiguous::make(SymbolVar(y));
    auto func = graph->compile({{mark, {}}});
    auto filter = [&set](OperatorNodeBase* opr) { return set.count(opr) > 0; };
    auto profiler = std::make_unique<GraphPartitionProfiler>(
            graph.get(), std::move(filter));
    for (int i = 0; i < m_runs; ++i)
        func->execute();
    return profiler->duration_in_usec();
}
ProfilerImpl::ProfilingResult ProfilerImpl::profile(
        const Problem& problem) const {
    ConstVarPropogate cvprop{ConstVarType::IMMUTABLE_AND_PARAM};
    {
        auto cb = [&cvprop](OperatorNodeBase* opr) { cvprop.add_opr(opr); };
        DepOprIter iter{cb};
        for (auto&& o : problem.graph_partition().output()) {
            iter.add(o->owner_opr());
        }
    }
    static const ThinHashMap<Typeinfo*, size_t> format_aware_input_tensors = {
#define cb(_Opr, _arity) {_Opr::typeinfo(), _arity}
            cb(Convolution, 2),
            cb(ConvBiasForward, 4),
            cb(ConvolutionBackwardData, 2),
            cb(PoolingForward, 1),
            cb(WarpPerspective, 1),
            cb(Resize, 1),
#undef cb
    };
    ThinHashSet<VarNode*> vars;
    ThinHashSet<OperatorNodeBase*> oprs;
    {
        auto cb = [&cvprop, &vars, &oprs](OperatorNodeBase* opr) {
            if (cvprop.is_const(opr))
                return;
            oprs.insert(opr);
            auto find = format_aware_input_tensors.find(opr->dyn_typeinfo());
            if (find == format_aware_input_tensors.end()) {
                for (auto&& i : opr->input()) {
                    if (!cvprop.is_const(i)) {
                        vars.insert(i);
                    }
                }
            } else {
                size_t nr_input_tensor =
                        std::min(find->second, opr->input().size());
                for (size_t i = 0; i < nr_input_tensor; ++i) {
                    if (!cvprop.is_const(opr->input(i))) {
                        vars.insert(opr->input(i));
                    }
                }
            }
            vars.insert(opr->output(0));
        };
        DepOprIter iter{cb};
        for (auto&& i : problem.graph_partition().input()) {
            iter.set_visited(i->owner_opr());
        }
        for (auto&& o : problem.graph_partition().output()) {
            iter.add(o->owner_opr());
        }
    }
    auto base_format = problem.base_format();
    auto&& available_tensor_formats = problem.available_tensor_formats();
    ProfilingResult profiling_result;
    auto& opr_record = profiling_result.opr_record;
    auto& var_record = profiling_result.var_record;
    for (auto&& var : vars) {
        var_record[var] =
                profile_var_node(var, base_format, available_tensor_formats);
    }
    for (auto&& opr : oprs) {
        auto&& opr_configs = problem.opr_configs();
        auto find = opr_configs.find(opr->dyn_typeinfo());
        if (find == opr_configs.end()) {
            opr_record[opr] = profile_operator(opr, base_format,
                                               available_tensor_formats);
        } else {
            auto&& dispatchers = find->second;
            SmallVector<OprTensorFormatsConfiguration> configs;
            for (const auto& item : dispatchers) {
                auto config = (*item.second)(opr);
                if (config.valid()) {
                    configs.emplace_back(config.val());
                }
            }
            auto base_config = problem.base_config(opr);
            opr_record[opr] = profile_operator(opr, base_config, configs);
        }
    }
    for (auto&& rpair : opr_record) {
        mgb_log_debug("%s", rpair.second.to_string().c_str());
    }
    for (auto&& rpair : var_record) {
        mgb_log_debug("%s", rpair.second.to_string().c_str());
    }
    return profiling_result;
}
/* ================== ProfilerBase =================*/
std::string ProfilerBase::OperatorNodeRecord::to_string() const {
    auto str = ssprintf("\nopr type: %s\nopr name: %s\ninputs:\n",
                        opr->dyn_typeinfo()->name, opr->cname());
    for (auto&& i : opr->input()) {
        str += ssprintf("\tvar: %s\n\tshape: %s\n", i->cname(),
                        i->shape().to_string().c_str());
    }
    str += ssprintf("outputs:\n\tvar: %s\n\tshape: %s\ncosts:\n",
                    opr->output(0)->cname(),
                    opr->output(0)->shape().to_string().c_str());
    for (auto&& cpair : costs) {
        str += ssprintf("\tformat: %s; cost:%f",
                        opr_format_to_string(cpair.first), cpair.second);
    }
    return str;
}

std::string ProfilerBase::VarNodeRecord::to_string() const {
    auto str = ssprintf("\nvar: %s\ncosts:", var->cname());
    for (auto&& cpair : costs) {
        auto&& formats = cpair.first;
        str += ssprintf("\n\tformat: (i:%s;o:%s); cost:%f",
                        tensor_formats_to_named_tensor_shape(formats.first)
                                .to_string()
                                .c_str(),
                        tensor_formats_to_named_tensor_shape(formats.second)
                                .to_string()
                                .c_str(),
                        cpair.second);
    }
    return str;
}

std::unique_ptr<ProfilerBase> ProfilerBase::make_profiler() {
    return std::make_unique<ProfilerImpl>();
}

// vim: syntax=cpp.doxygen
@@ -247,16 +247,36 @@ ReformatEmitter::UnderlyingBuilders ReformatEmitter::analyze() const { | |||
/* ============== PaddingEmitter ================= */ | |||
PaddingEmitter::EmitResult PaddingEmitter::emit() const { | |||
auto&& padshp = m_padshp; | |||
auto&& const_extent = m_const_extent; | |||
auto&& axis = m_axis; | |||
auto builder = [const_extent, axis](const VarNodeArray& vars) { | |||
auto builder = [padshp, const_extent, axis](const VarNodeArray& vars) { | |||
auto i = vars[0]; | |||
auto padding_shp_var = vars[1]; | |||
TensorShape shape; | |||
shape.ndim = i->shape().ndim; | |||
for (size_t ax = 0; ax < shape.ndim; ++ax) | |||
shape[ax] = 1; | |||
shape[axis] = const_extent; | |||
// avoid making a scalar lowbit tensor | |||
if (!i->dtype().is_low_bit() || const_extent != 1) | |||
shape[axis] = const_extent; | |||
else { | |||
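// the padding constant would otherwise be a scalar low-bit tensor, which | |||
// cannot be materialized; search the padding shape for the first | |||
// statically-known, non-unit extent and pad along that axis instead | |||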
size_t const_axis = 0; | |||
size_t new_const_extent = const_extent; | |||
for (size_t i = 0; i < padshp.ndim; ++i) { | |||
const auto& dim = padshp[i]; | |||
if (dim.extent() != Dimension::UNDETERMINED_EXTENT && | |||
dim.extent() != 1) { | |||
new_const_extent = dim.extent(); | |||
const_axis = i; | |||
break; | |||
} | |||
} | |||
mgb_assert(new_const_extent != 1, | |||
"cannot make an scalar lowbit tensor(got:%s)", | |||
i->dtype().name()); | |||
shape[const_axis] = new_const_extent; | |||
} | |||
auto host_val = | |||
std::make_shared<HostTensorND>(i->comp_node(), i->dtype()); | |||
host_val->resize(shape); | |||
@@ -13,6 +13,7 @@ | |||
#include "megbrain/gopt/reformat_manager.h" | |||
#include "megbrain/opr/tensor_manip.h" | |||
#include "megbrain/utils/arith_helper.h" | |||
#include "./utils.h" | |||
using namespace mgb; | |||
using namespace gopt; | |||
@@ -32,68 +33,6 @@ int gcd(const int& p, const int& q) { | |||
} | |||
return x; | |||
} | |||
NamedTensorShape tensor_formats_to_named_tensor_shape(TensorFormats format) { | |||
switch (format) { | |||
case TensorFormats::NCHW: | |||
return {{"N"}, {"C"}, {"H"}, {"W"}}; | |||
case TensorFormats::NHWC: | |||
return {{"N"}, {"H"}, {"W"}, {"C"}}; | |||
case TensorFormats::NCHWc4: | |||
return {{"N"}, {"C//4"}, {"H"}, {"W"}, {"C%4"}}; | |||
case TensorFormats::NCHWc8: | |||
return {{"N"}, {"C//8"}, {"H"}, {"W"}, {"C%8"}}; | |||
case TensorFormats::NCHWc32: | |||
return {{"N"}, {"C//32"}, {"H"}, {"W"}, {"C%32"}}; | |||
case TensorFormats::NCHWc64: | |||
return {{"N"}, {"C//64"}, {"H"}, {"W"}, {"C%64"}}; | |||
case TensorFormats::CHWNc4: | |||
return {{"C//4"}, {"H"}, {"W"}, {"N"}, {"C%4"}}; | |||
case TensorFormats::NHCWc4: | |||
return {{"N"}, {"H"}, {"C//4"}, {"W"}, {"C%4"}}; | |||
case TensorFormats::KRSCk4: | |||
return {{"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}}; | |||
case TensorFormats::GKRSCk4: | |||
return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}}; | |||
case TensorFormats::C1RSc4: | |||
return {{"C//4"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}}; | |||
case TensorFormats::KRSCk4c4: | |||
return {{"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}}; | |||
case TensorFormats::GKRSCk4c4: | |||
return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}}; | |||
case TensorFormats::KCRSk4c4: | |||
return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}}; | |||
case TensorFormats::GKCRSk4c4: | |||
return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}}; | |||
case TensorFormats::KCRSc4k4: | |||
return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}}; | |||
case TensorFormats::GKCRSc4k4: | |||
return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}}; | |||
case TensorFormats::C11RSc4: | |||
return {{"C//4"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}}; | |||
case TensorFormats::KCRSc8k8: | |||
return {{"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}}; | |||
case TensorFormats::GKCRSc8k8: | |||
return {{"G"}, {"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}}; | |||
case TensorFormats::C11RSc8: | |||
return {{"C//8"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%8"}}; | |||
case TensorFormats::KRSCk8: | |||
return {{"K//8"}, {"R"}, {"S"}, {"C"}, {"K%8"}}; | |||
case TensorFormats::KCRSc4: | |||
return {{"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}}; | |||
case TensorFormats::GKCRSc4: | |||
return {{"G"}, {"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}}; | |||
case TensorFormats::KCRS: | |||
return {{"K"}, {"C"}, {"R"}, {"S"}}; | |||
case TensorFormats::GKCRS: | |||
return {{"G"}, {"K"}, {"C"}, {"R"}, {"S"}}; | |||
case TensorFormats::C11RS: | |||
return {{"C"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}}; | |||
default: | |||
mgb_throw(AssertionError, "invalid tensor formats(%u)", | |||
static_cast<uint32_t>(format)); | |||
} | |||
} | |||
}; // namespace | |||
/* =================== ReformatManager::ReformatKey ====================*/ | |||
@@ -393,8 +332,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue( | |||
tensor_formats_to_named_tensor_shape(key.input_format); | |||
NamedTensorShape output_shape = | |||
tensor_formats_to_named_tensor_shape(key.output_format); | |||
size_t input_alignment, output_alignment; | |||
size_t input_channel_idx, output_channel_idx; | |||
size_t input_alignment = 0; | |||
size_t output_alignment = 0; | |||
size_t input_channel_idx = input_shape.ndim, | |||
output_channel_idx = input_shape.ndim; | |||
for (size_t i = 0; i < input_shape.ndim; ++i) { | |||
if (input_shape[i].name() == Dimension::Name::C && | |||
input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) { | |||
@@ -411,6 +352,15 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue( | |||
break; | |||
} | |||
} | |||
mgb_assert(input_channel_idx < input_shape.ndim && | |||
output_channel_idx < input_shape.ndim, | |||
"invalid channel idx(in_channel:%zu, out_channel:%zu, shp:%s)", | |||
input_channel_idx, output_channel_idx, | |||
input_shape.to_string().c_str()); | |||
mgb_assert(input_alignment > 0 && output_alignment > 0, | |||
"invalid alignment(in_channel:%zu, out_channel:%zu, shp:%s)", | |||
input_alignment, output_alignment, | |||
input_shape.to_string().c_str()); | |||
NamedTensorShape orig_shape = | |||
tensor_formats_to_named_tensor_shape(orig_format); | |||
size_t orig_channel = 0; | |||
@@ -448,8 +398,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_featrue( | |||
auto make_shape = std::get<0>( | |||
MakeShapeEmitter{input_shape, padding_shape}.emit()); | |||
auto padding_shp_var = make_shape({x}); | |||
auto padding = std::get<0>( | |||
PaddingEmitter{const_extent, input_channel_idx}.emit()); | |||
auto padding = std::get<0>(PaddingEmitter{ | |||
padding_shape, const_extent, input_channel_idx} | |||
.emit()); | |||
cur = padding({cur, padding_shp_var}); | |||
} | |||
cur = ReformatManager::instance().get(key)({cur}); | |||
@@ -469,9 +420,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( | |||
const VarNode* orig_var, const ReformatKey& key, | |||
const AlignmentDesc& extra_alignment) const { | |||
size_t in_channels = 0, out_channels = 0; | |||
size_t input_channel_idx, output_channel_idx; | |||
Dimension::Name out_channel_name; | |||
Dimension::Name out_channel_name = Dimension::Name::C; | |||
auto input_shape = tensor_formats_to_named_tensor_shape(key.input_format); | |||
size_t input_channel_idx = input_shape.ndim, | |||
output_channel_idx = input_shape.ndim; | |||
for (size_t i = 0; i < input_shape.ndim; ++i) { | |||
if (input_shape[i].name() == Dimension::Name::C && | |||
input_shape[i].extent() == Dimension::UNDETERMINED_EXTENT) { | |||
@@ -491,7 +443,15 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( | |||
input_shape.to_string().c_str()); | |||
} | |||
} | |||
size_t in_channel_alignment, out_channel_alignment = 1; | |||
mgb_assert(out_channel_name == Dimension::Name::K || | |||
out_channel_name == Dimension::Name::N, | |||
"invalid out channel(shp:%s)", input_shape.to_string().c_str()); | |||
mgb_assert(input_channel_idx < input_shape.ndim && | |||
output_channel_idx < input_shape.ndim, | |||
"invalid channel idx(in_channel:%zu, out_channel:%zu, shp:%s)", | |||
input_channel_idx, output_channel_idx, | |||
input_shape.to_string().c_str()); | |||
size_t in_channel_alignment = 0, out_channel_alignment = 0; | |||
auto output_shape = tensor_formats_to_named_tensor_shape(key.output_format); | |||
for (size_t i = 0; i < output_shape.ndim; ++i) { | |||
if (output_shape[i].name() == Dimension::Name::C && | |||
@@ -502,6 +462,10 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( | |||
out_channel_alignment = output_shape[i].stride(); | |||
} | |||
} | |||
mgb_assert(in_channel_alignment > 0 && out_channel_alignment > 0, | |||
"invalid alignment(in_channel:%zu, out_channel:%zu, shp:%s)", | |||
in_channel_alignment, out_channel_alignment, | |||
output_shape.to_string().c_str()); | |||
size_t aligned_in_channel = | |||
divup(in_channels, in_channel_alignment) * in_channel_alignment; | |||
if (extra_alignment.name == out_channel_name) { | |||
@@ -526,8 +490,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( | |||
auto make_shape = std::get<0>( | |||
MakeShapeEmitter{input_shape, padding_shape}.emit()); | |||
auto padding_shp_var = make_shape({x}); | |||
auto padding = std::get<0>( | |||
PaddingEmitter{const_extent, input_channel_idx}.emit()); | |||
auto padding = std::get<0>(PaddingEmitter{ | |||
padding_shape, const_extent, input_channel_idx} | |||
.emit()); | |||
cur = padding({cur, padding_shp_var}); | |||
} | |||
if (aligned_out_channel > out_channels) { | |||
@@ -540,8 +505,9 @@ ReformatManager::ReformatImpl ReformatManager::auto_aligned_reformat_weight( | |||
auto make_shape = std::get<0>( | |||
MakeShapeEmitter{input_shape, padding_shape}.emit()); | |||
auto padding_shp_var = make_shape({cur}); | |||
auto padding = std::get<0>( | |||
PaddingEmitter{const_extent, output_channel_idx}.emit()); | |||
auto padding = std::get<0>(PaddingEmitter{ | |||
padding_shape, const_extent, output_channel_idx} | |||
.emit()); | |||
cur = padding({cur, padding_shp_var}); | |||
} | |||
cur = ReformatManager::instance().get(key)({cur}); | |||
@@ -554,4 +520,81 @@ const ReformatManager& ReformatManager::instance() { | |||
static ReformatManager inst; | |||
return inst; | |||
} | |||
TensorShape mgb::gopt::make_aligned_tensor_shape(const VarNode* var, | |||
TensorFormats orig_formats, | |||
TensorFormats target_formats) { | |||
using Dimension = megdnn::Dimension; | |||
static constexpr uint32_t UNDETERMINED_EXTENT = | |||
Dimension::UNDETERMINED_EXTENT; | |||
auto orig_shape = tensor_formats_to_named_tensor_shape(orig_formats); | |||
auto target_shape = tensor_formats_to_named_tensor_shape(target_formats); | |||
TensorShape oshp = var->shape(); | |||
mgb_assert(oshp.is_scalar() || oshp.ndim == orig_shape.ndim, | |||
"orig shape of var node is not compatible with tensor " | |||
"formats(var:%s;shp:%s;fmt:%s)", | |||
var->cname(), oshp.to_string().c_str(), | |||
orig_shape.to_string().c_str()); | |||
if (oshp.is_scalar()) return oshp; | |||
TensorShape tshp; | |||
ThinHashMap<Dimension::Name, int> name2dominant; | |||
for (size_t i = 0; i < orig_shape.ndim; ++i) { | |||
auto name = orig_shape[i].name(); | |||
if (orig_shape[i].extent() == UNDETERMINED_EXTENT) { | |||
auto insert = name2dominant.insert(std::make_pair(name, i)); | |||
mgb_assert(insert.second); | |||
} | |||
} | |||
tshp.ndim = target_shape.ndim; | |||
for (size_t i = 0; i < target_shape.ndim; ++i) { | |||
auto name = target_shape[i].name(); | |||
if (target_shape[i].extent() == UNDETERMINED_EXTENT) { | |||
int idx = name2dominant.at(name); | |||
bool mul = orig_shape[idx] < target_shape[i]; | |||
size_t factor = mul ? (target_shape[i] / orig_shape[idx]).extent() | |||
: (orig_shape[idx] / target_shape[i]).extent(); | |||
if (mul) | |||
tshp[i] = oshp[idx] * factor; | |||
else | |||
tshp[i] = divup(oshp[idx], factor); | |||
} else { | |||
tshp[i] = target_shape[i].extent(); | |||
} | |||
} | |||
return tshp; | |||
} | |||
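// Worked example (editor's sketch; `x` is a hypothetical contiguous NCHW | |||
// feature var of shape (64, 48, 14, 14)): reformatting to NCHWc64 expands | |||
// x to the target named shape {N, C//64, H, W, C%64}; the undetermined | |||
// C//64 axis takes divup(48, 64) = 1 and C%64 takes 64, so the channels | |||
// are padded up to the next multiple of 64: | |||
// | |||
//     TensorShape aligned = mgb::gopt::make_aligned_tensor_shape( | |||
//             x, TensorFormats::NCHW, TensorFormats::NCHWc64); | |||
//     // aligned == (64, 1, 14, 14, 64) | |||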
TensorShape mgb::gopt::make_aligned_weight_shape(const VarNode* var, | |||
TensorFormats orig_formats, | |||
TensorFormats target_formats, | |||
TensorFormats extra_formats) { | |||
auto tshp = make_aligned_tensor_shape(var, orig_formats, target_formats); | |||
auto extra_shape = tensor_formats_to_named_tensor_shape(extra_formats); | |||
using Dimension = megdnn::Dimension; | |||
static constexpr uint32_t UNDETERMINED_EXTENT = | |||
Dimension::UNDETERMINED_EXTENT; | |||
size_t out_channel_alignment = 1; | |||
for (size_t i = 0; i < extra_shape.ndim; ++i) { | |||
auto name = extra_shape[i].name(); | |||
if (name == Dimension::Name::C && | |||
extra_shape[i].extent() == UNDETERMINED_EXTENT) { | |||
out_channel_alignment = extra_shape[i].stride(); | |||
} | |||
} | |||
auto target_shape = tensor_formats_to_named_tensor_shape(target_formats); | |||
for (size_t i = 0; i < target_shape.ndim; ++i) { | |||
auto name = target_shape[i].name(); | |||
if ((name == Dimension::Name::K || name == Dimension::Name::N) && | |||
target_shape[i].extent() == UNDETERMINED_EXTENT) { | |||
size_t out_channels = tshp[i] * target_shape[i].stride(); | |||
tshp[i] = divup(out_channels, out_channel_alignment) * | |||
out_channel_alignment / target_shape[i].stride(); | |||
} | |||
} | |||
return tshp; | |||
} | |||
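// Worked example (editor's sketch; `w` is a hypothetical KCRS weight var | |||
// of shape (48, 48, 3, 3)): aligning to KCRSc4k4 first yields | |||
// (12, 12, 3, 3, 4, 4); with extra_formats = NCHWc64 the output channel | |||
// alignment becomes 64, so the K//4 axis is rounded up: | |||
// out_channels = 12 * 4 = 48, divup(48, 64) * 64 / 4 = 16: | |||
// | |||
//     TensorShape wshp = mgb::gopt::make_aligned_weight_shape( | |||
//             w, TensorFormats::KCRS, TensorFormats::KCRSc4k4, | |||
//             TensorFormats::NCHWc64); | |||
//     // wshp == (16, 12, 3, 3, 4, 4), i.e. 64 aligned output channels | |||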
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,105 @@ | |||
/** | |||
* \file src/gopt/impl/utils.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#pragma once | |||
#include "megbrain/gopt/global_layout_transform.h" | |||
namespace mgb { | |||
namespace gopt { | |||
static inline const char* opr_format_to_string( | |||
OprTensorFormatsConfiguration::OprFormat opr_format) { | |||
using OprFormat = OprTensorFormatsConfiguration::OprFormat; | |||
#define cb(_fmt) \ | |||
case OprFormat::_fmt: \ | |||
return #_fmt | |||
switch (opr_format) { | |||
cb(NCHW); | |||
cb(NHWC); | |||
cb(NCHW4); | |||
cb(NCHW32); | |||
cb(NCHW64); | |||
cb(CHWN4); | |||
default: | |||
mgb_assert(false, "Invalid opr format(got:%u)", | |||
static_cast<uint32_t>(opr_format)); | |||
} | |||
#undef cb | |||
} | |||
static inline megdnn::NamedTensorShape tensor_formats_to_named_tensor_shape( | |||
TensorFormats format) { | |||
switch (format) { | |||
case TensorFormats::NCHW: | |||
return {{"N"}, {"C"}, {"H"}, {"W"}}; | |||
case TensorFormats::NHWC: | |||
return {{"N"}, {"H"}, {"W"}, {"C"}}; | |||
case TensorFormats::NCHWc4: | |||
return {{"N"}, {"C//4"}, {"H"}, {"W"}, {"C%4"}}; | |||
case TensorFormats::NCHWc8: | |||
return {{"N"}, {"C//8"}, {"H"}, {"W"}, {"C%8"}}; | |||
case TensorFormats::NCHWc32: | |||
return {{"N"}, {"C//32"}, {"H"}, {"W"}, {"C%32"}}; | |||
case TensorFormats::NCHWc64: | |||
return {{"N"}, {"C//64"}, {"H"}, {"W"}, {"C%64"}}; | |||
case TensorFormats::CHWNc4: | |||
return {{"C//4"}, {"H"}, {"W"}, {"N"}, {"C%4"}}; | |||
case TensorFormats::NHCWc4: | |||
return {{"N"}, {"H"}, {"C//4"}, {"W"}, {"C%4"}}; | |||
case TensorFormats::KRSCk4: | |||
return {{"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}}; | |||
case TensorFormats::GKRSCk4: | |||
return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C"}, {"K%4"}}; | |||
case TensorFormats::C1RSc4: | |||
return {{"C//4"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}}; | |||
case TensorFormats::KRSCk4c4: | |||
return {{"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}}; | |||
case TensorFormats::GKRSCk4c4: | |||
return {{"G"}, {"K//4"}, {"R"}, {"S"}, {"C//4"}, {"K%4"}, {"C%4"}}; | |||
case TensorFormats::KCRSk4c4: | |||
return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}}; | |||
case TensorFormats::GKCRSk4c4: | |||
return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"K%4"}, {"C%4"}}; | |||
case TensorFormats::KCRSc4k4: | |||
return {{"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}}; | |||
case TensorFormats::GKCRSc4k4: | |||
return {{"G"}, {"K//4"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}, {"K%4"}}; | |||
case TensorFormats::C11RSc4: | |||
return {{"C//4"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%4"}}; | |||
case TensorFormats::KCRSc8k8: | |||
return {{"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}}; | |||
case TensorFormats::GKCRSc8k8: | |||
return {{"G"}, {"K//8"}, {"C//8"}, {"R"}, {"S"}, {"C%8"}, {"K%8"}}; | |||
case TensorFormats::C11RSc8: | |||
return {{"C//8"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}, {"C%8"}}; | |||
case TensorFormats::KRSCk8: | |||
return {{"K//8"}, {"R"}, {"S"}, {"C"}, {"K%8"}}; | |||
case TensorFormats::KCRSc4: | |||
return {{"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}}; | |||
case TensorFormats::GKCRSc4: | |||
return {{"G"}, {"K"}, {"C//4"}, {"R"}, {"S"}, {"C%4"}}; | |||
case TensorFormats::KCRS: | |||
return {{"K"}, {"C"}, {"R"}, {"S"}}; | |||
case TensorFormats::GKCRS: | |||
return {{"G"}, {"K"}, {"C"}, {"R"}, {"S"}}; | |||
case TensorFormats::C11RS: | |||
return {{"C"}, {"C%1"}, {"C%1"}, {"R"}, {"S"}}; | |||
default: | |||
mgb_throw(AssertionError, "invalid tensor formats(%u)", | |||
static_cast<uint32_t>(format)); | |||
} | |||
} | |||
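// Notation note (editor's addition): "C//4" denotes the outer channel axis | |||
// (extent undetermined, stride 4) and "C%4" the inner channel axis (extent | |||
// 4, stride 1); e.g. for a logical channel extent of 48, NCHWc4 resolves | |||
// C//4 to 12 and C%4 to 4. | |||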
} // namespace gopt | |||
} // namespace mgb | |||
// vim: syntax=cpp.doxygen |
@@ -0,0 +1,176 @@ | |||
/** | |||
* \file src/gopt/include/megbrain/gopt/global_layout_transform.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#pragma once | |||
#include "megbrain/gopt/reformat_manager.h" | |||
#include "megbrain/gopt/subgraph_extractor.h" | |||
#include "megbrain/opr/dnn/convolution.h" | |||
namespace mgb { | |||
namespace gopt { | |||
/*! | |||
* \brief A structure that describes the data type and tensor format | |||
* configuration of an opr format | |||
*/ | |||
struct OprTensorFormatsConfiguration { | |||
using OprFormat = opr::ConvBias::Param::Format; | |||
using OprTensorFormatsDispatcher = | |||
thin_function<Maybe<OprTensorFormatsConfiguration>( | |||
const cg::OperatorNodeBase*)>; | |||
Typeinfo* typeinfo; | |||
OprFormat opr_format; | |||
SmallVector<DTypeEnum> input_dtypes; | |||
SmallVector<DTypeEnum> output_dtypes; | |||
SmallVector<TensorFormats> input_tensor_formats; | |||
SmallVector<TensorType> input_tensor_types; | |||
SmallVector<TensorFormats> output_tensor_formats; | |||
static OprTensorFormatsDispatcher* find_dispatcher_by_type_format( | |||
Typeinfo* type, OprFormat opr_format); | |||
}; | |||
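/*! | |||
 * A sketch of one possible configuration (editor's illustration; the exact | |||
 * values are hypothetical and would normally be produced by a registered | |||
 * dispatcher): | |||
 * | |||
 *     OprTensorFormatsConfiguration cfg; | |||
 *     cfg.typeinfo = opr::ConvBias::typeinfo(); | |||
 *     cfg.opr_format = OprFormat::NCHW4; | |||
 *     cfg.input_dtypes = {DTypeEnum::QuantizedS8, DTypeEnum::QuantizedS8, | |||
 *                         DTypeEnum::QuantizedS32}; | |||
 *     cfg.input_tensor_formats = {TensorFormats::NCHWc4, | |||
 *                                 TensorFormats::NCHWc4, | |||
 *                                 TensorFormats::NCHWc4}; | |||
 *     cfg.input_tensor_types = {TensorType::FEATURE, TensorType::WEIGHT, | |||
 *                               TensorType::FEATURE}; | |||
 *     cfg.output_dtypes = {DTypeEnum::QuantizedS8}; | |||
 *     cfg.output_tensor_formats = {TensorFormats::NCHWc4}; | |||
 */ | |||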
/*! | |||
* \brief A structure that describes the global layout transform problem | |||
*/ | |||
class Problem { | |||
public: | |||
using OprFormat = OprTensorFormatsConfiguration::OprFormat; | |||
using OprTensorFormatsDispatcher = | |||
OprTensorFormatsConfiguration::OprTensorFormatsDispatcher; | |||
using OprConfigTrait = | |||
ThinHashMap<Typeinfo*, | |||
ThinHashMap<OprFormat, OprTensorFormatsDispatcher*>>; | |||
struct Attribute { | |||
OprFormat base_opr_format; /// the base opr format indicates that the | |||
/// network to be optimized is constructed | |||
/// in the base opr format, i.e. all the | |||
/// format aware operators (conv, conv_bias, | |||
/// deconv, pooling etc.) are built in | |||
/// this format. | |||
TensorFormats | |||
base_tensor_formats; /// the base tensor format indicates that | |||
/// all the format agnostic operators | |||
/// (like elemwise, elemwise multi type, | |||
/// typecvt etc.) are built in the base | |||
/// tensor format. | |||
}; | |||
Problem(const GraphPartition& graph_partition, | |||
const SmallVector<TensorFormats>& available_tensor_formats, | |||
const OprConfigTrait& opr_config, const Attribute& attribute) | |||
: m_graph_partition{graph_partition}, | |||
m_available_tensor_formats{available_tensor_formats}, | |||
m_opr_configs{opr_config}, | |||
m_attribute{attribute} {} | |||
~Problem() noexcept = default; | |||
const GraphPartition& graph_partition() const { return m_graph_partition; } | |||
const OprConfigTrait& opr_configs() const { return m_opr_configs; } | |||
const SmallVector<TensorFormats>& available_tensor_formats() const { | |||
return m_available_tensor_formats; | |||
} | |||
TensorFormats base_format() const { | |||
return m_attribute.base_tensor_formats; | |||
} | |||
OprTensorFormatsConfiguration base_config( | |||
const cg::OperatorNodeBase* opr) const { | |||
auto _ = OprTensorFormatsConfiguration::find_dispatcher_by_type_format( | |||
opr->dyn_typeinfo(), m_attribute.base_opr_format); | |||
auto rst = (*_)(opr); | |||
if (rst.valid()) | |||
return rst.val(); | |||
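// the dispatcher could not produce a valid configuration for this | |||
// opr; synthesize a default one from the base opr format and the | |||
// base tensor formats | |||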
OprTensorFormatsConfiguration config; | |||
config.typeinfo = opr->dyn_typeinfo(); | |||
config.opr_format = m_attribute.base_opr_format; | |||
for (const auto& i : opr->input()) { | |||
config.input_dtypes.emplace_back(i->dtype().enumv()); | |||
config.input_tensor_formats.emplace_back( | |||
m_attribute.base_tensor_formats); | |||
config.input_tensor_types.emplace_back(TensorType::FEATURE); | |||
} | |||
config.output_dtypes.emplace_back(opr->output(0)->dtype().enumv()); | |||
config.output_tensor_formats.emplace_back( | |||
m_attribute.base_tensor_formats); | |||
return config; | |||
} | |||
private: | |||
const GraphPartition& m_graph_partition; /// the graph partition | |||
const SmallVector<TensorFormats>& | |||
m_available_tensor_formats; /// the available tensor formats, used | |||
/// for format agnostic operators (like | |||
/// elemwise, elemwise multi type, | |||
/// typecvt, etc.) | |||
const OprConfigTrait& | |||
m_opr_configs; /// the available opr format configurations, used | |||
/// for format aware operators (like conv, deconv, | |||
/// conv_bias, etc.) | |||
Attribute m_attribute; /// the extra attributes to describe the problem | |||
}; | |||
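// Construction sketch (editor's illustration; `partition`, `formats` and | |||
// `configs` are assumed to be prepared by the caller): | |||
// | |||
//     Problem::Attribute attr{OprFormat::NCHW, TensorFormats::NCHW}; | |||
//     Problem problem(partition, formats, configs, attr); | |||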
/*! | |||
* \brief A profiler that collects all the performance data to describe the | |||
* global layout transform problem. | |||
*/ | |||
class ProfilerBase { | |||
public: | |||
using OprFormat = Problem::OprFormat; | |||
struct OperatorNodeRecord { | |||
const cg::OperatorNodeBase* opr; ///< pointer to operator node | |||
ThinHashMap<OprFormat, float> | |||
costs; ///< costs of operator node, i.e. the elapsed device | |||
///< time of the operator node on different opr format | |||
///< (layout configuration). | |||
std::string to_string() const; | |||
}; | |||
struct VarNodeRecord { | |||
struct KeyHash { | |||
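// std::hash provides no specialization for std::pair, so combine | |||
// the hashes of the two tensor formats manually | |||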
size_t operator()( | |||
const std::pair<TensorFormats, TensorFormats>& val) const { | |||
size_t h1 = | |||
std::hash<uint32_t>()(static_cast<uint32_t>(val.first)); | |||
size_t h2 = std::hash<uint32_t>()( | |||
static_cast<uint32_t>(val.second)); | |||
return mgb::hash_pair_combine(h1, h2); | |||
} | |||
}; | |||
const VarNode* var; ///< pointer to var node | |||
std::unordered_map<std::pair<TensorFormats, TensorFormats>, float, | |||
KeyHash> | |||
costs; ///< costs of var node, i.e. the elapsed | |||
///< device time of the layout transform. | |||
///< Key of the hashmap indicates the | |||
///< source tensor format and the target | |||
///< tensor format. | |||
std::string to_string() const; | |||
}; | |||
/*! | |||
* \note the profiler assumes all the input and output var nodes are stored | |||
* in contiguous layout in memory | |||
*/ | |||
struct ProfilingResult { | |||
/// A hashmap that maps the operator node to the costs (device elapsed | |||
/// time) of different layout configurations | |||
ThinHashMap<cg::OperatorNodeBase*, OperatorNodeRecord> opr_record; | |||
/// A hashmap that maps the var node to the costs of the layout transform | |||
ThinHashMap<VarNode*, VarNodeRecord> var_record; | |||
}; | |||
ProfilerBase() = default; | |||
virtual ~ProfilerBase() = default; | |||
virtual ProfilingResult profile(const Problem& problem) const = 0; | |||
static std::unique_ptr<ProfilerBase> make_profiler(); | |||
}; | |||
} // namespace gopt | |||
} // namespace mgb | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -80,11 +80,13 @@ private: | |||
class PaddingEmitter final : public Emitter { | |||
public: | |||
PaddingEmitter(size_t const_extent, size_t axis) | |||
: m_const_extent{const_extent}, m_axis{axis} {} | |||
PaddingEmitter(const megdnn::NamedTensorShape& padshp, size_t const_extent, | |||
size_t axis) | |||
: m_padshp{padshp}, m_const_extent{const_extent}, m_axis{axis} {} | |||
EmitResult emit() const override; | |||
private: | |||
megdnn::NamedTensorShape m_padshp; | |||
size_t m_const_extent, m_axis; | |||
}; | |||
@@ -17,6 +17,11 @@ | |||
namespace mgb { | |||
namespace gopt { | |||
enum class TensorType : uint32_t { | |||
FEATURE = 0, | |||
WEIGHT = 1, | |||
}; | |||
enum class TensorFormats : uint32_t { | |||
// input tensor formats | |||
NCHW = 0, ///< [N, C, H, W] | |||
@@ -116,6 +121,15 @@ public: | |||
private: | |||
ReformatCache m_cache; | |||
}; | |||
TensorShape make_aligned_tensor_shape(const VarNode* var, | |||
TensorFormats orig_formats, | |||
TensorFormats target_formats); | |||
TensorShape make_aligned_weight_shape(const VarNode* var, | |||
TensorFormats orig_formats, | |||
TensorFormats target_formats, | |||
TensorFormats extra_formats); | |||
} // namespace gopt | |||
} // namespace mgb | |||
@@ -20,6 +20,7 @@ class GraphPartition { | |||
public: | |||
using VarNodeSet = ThinHashSet<VarNode*>; | |||
using OperatorNodeSet = ThinHashSet<cg::OperatorNodeBase*>; | |||
class InputPlaceholder; | |||
GraphPartition() = default; | |||
@@ -45,13 +46,13 @@ private: | |||
class SubGraphExtractor { | |||
public: | |||
using OprList = ThinHashSet<Typeinfo*>; | |||
SubGraphExtractor(OprList opr_list) : m_opr_list{opr_list} {}; | |||
SubGraphExtractor(const OprList& opr_list) : m_opr_list{opr_list} {}; | |||
std::vector<GraphPartition> extract( | |||
const SymbolVarArray& endpoint_vars) const; | |||
private: | |||
class Impl; | |||
OprList m_opr_list; | |||
const OprList& m_opr_list; | |||
}; | |||
} // namespace gopt | |||
@@ -0,0 +1,429 @@ | |||
/** | |||
* \file src/gopt/test/profiler.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#include "./helper.h" | |||
#include "megbrain/gopt/global_layout_transform.h" | |||
#include "megbrain/gopt/inference.h" | |||
#include "megbrain/opr/dnn/pooling.h" | |||
#include "megbrain/opr/imgproc.h" | |||
#include "megbrain/opr/nn_int.h" | |||
#include "megbrain/serialization/serializer.h" | |||
using namespace mgb; | |||
using namespace gopt; | |||
using namespace serialization; | |||
namespace { | |||
class LayoutTransformContext : public NonCopyableObj { | |||
public: | |||
using OprList = SubGraphExtractor::OprList; | |||
using OprFormat = Problem::OprFormat; | |||
using OprConfigTrait = Problem::OprConfigTrait; | |||
LayoutTransformContext() = delete; | |||
LayoutTransformContext(OprList opr_list, | |||
SmallVector<TensorFormats> available_tensor_formats, | |||
OprConfigTrait opr_configs) | |||
: m_opr_list{std::move(opr_list)}, | |||
m_available_tensor_formats{std::move(available_tensor_formats)}, | |||
m_opr_configs{std::move(opr_configs)} {} | |||
const OprList& opr_list() const { return m_opr_list; } | |||
const SmallVector<TensorFormats>& available_tensor_formats() const { | |||
return m_available_tensor_formats; | |||
} | |||
const OprConfigTrait& opr_configs() const { return m_opr_configs; } | |||
static std::unique_ptr<LayoutTransformContext> make() { | |||
OprList opr_list = { | |||
opr::ConvBiasForward::typeinfo(), | |||
opr::ConvolutionForward::typeinfo(), | |||
opr::ConvolutionBackwardData::typeinfo(), | |||
opr::ElemwiseMultiType::typeinfo(), | |||
opr::Elemwise::typeinfo(), | |||
opr::TypeCvt::typeinfo(), | |||
opr::PoolingForward::typeinfo(), | |||
opr::WarpPerspectiveForward::typeinfo(), | |||
}; | |||
OprConfigTrait opr_configs; | |||
{ | |||
auto& dispatchers = opr_configs[opr::ConvBias::typeinfo()]; | |||
#define cb(_fmt) \ | |||
dispatchers[OprFormat::_fmt] = \ | |||
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \ | |||
opr::ConvBias::typeinfo(), OprFormat::_fmt); | |||
cb(NCHW4); | |||
cb(NCHW32); | |||
cb(NHWC); | |||
cb(NCHW64); | |||
cb(CHWN4); | |||
#undef cb | |||
} | |||
{ | |||
auto& dispatchers = | |||
opr_configs[opr::ConvolutionBackwardData::typeinfo()]; | |||
#define cb(_fmt) \ | |||
dispatchers[OprFormat::_fmt] = \ | |||
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \ | |||
opr::ConvolutionBackwardData::typeinfo(), \ | |||
OprFormat::_fmt); | |||
cb(NCHW4); | |||
#undef cb | |||
} | |||
{ | |||
auto& dispatchers = | |||
opr_configs[opr::ConvolutionForward::typeinfo()]; | |||
#define cb(_fmt) \ | |||
dispatchers[OprFormat::_fmt] = \ | |||
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \ | |||
opr::ConvolutionForward::typeinfo(), OprFormat::_fmt); | |||
cb(NCHW4); | |||
#undef cb | |||
} | |||
{ | |||
auto& dispatchers = opr_configs[opr::PoolingForward::typeinfo()]; | |||
#define cb(_fmt) \ | |||
dispatchers[OprFormat::_fmt] = \ | |||
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \ | |||
opr::PoolingForward::typeinfo(), OprFormat::_fmt); | |||
cb(NCHW4); | |||
cb(NCHW32); | |||
cb(NHWC); | |||
cb(NCHW64); | |||
cb(CHWN4); | |||
#undef cb | |||
} | |||
{ | |||
auto& dispatchers = | |||
opr_configs[opr::WarpPerspectiveForward::typeinfo()]; | |||
#define cb(_fmt) \ | |||
dispatchers[OprFormat::_fmt] = \ | |||
OprTensorFormatsConfiguration::find_dispatcher_by_type_format( \ | |||
opr::WarpPerspectiveForward::typeinfo(), OprFormat::_fmt); | |||
cb(NHWC); | |||
cb(NCHW4); | |||
cb(NCHW64); | |||
#undef cb | |||
} | |||
SmallVector<TensorFormats> available_tensor_formats = { | |||
TensorFormats::NHWC, TensorFormats::NCHWc4, | |||
TensorFormats::NCHWc32, TensorFormats::NCHWc64}; | |||
return std::make_unique<LayoutTransformContext>( | |||
std::move(opr_list), std::move(available_tensor_formats), | |||
std::move(opr_configs)); | |||
} | |||
private: | |||
OprList m_opr_list; | |||
SmallVector<TensorFormats> m_available_tensor_formats; | |||
OprConfigTrait m_opr_configs; | |||
}; | |||
}; // namespace | |||
#if MGB_CUDA | |||
#if CUDA_VERSION >= 10020 | |||
TEST(TestProfiler, Conv) { | |||
REQUIRE_GPU(1); | |||
auto cn = CompNode::load("gpu0"); | |||
cn.activate(); | |||
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); | |||
auto ctx = LayoutTransformContext::make(); | |||
HostTensorGenerator<dtype::Int8> gen; | |||
auto graph = ComputingGraph::make(); | |||
graph->options().graph_opt_level = 0; | |||
auto mkvar = [&](const char* name, const TensorShape& shp, | |||
const DType& dtype) { | |||
return opr::TypeCvt::make( | |||
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), | |||
dtype); | |||
}; | |||
auto mkcvar = [&](const char* name, const TensorShape& shp, | |||
const DType& dtype) { | |||
return opr::TypeCvt::make( | |||
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)) | |||
.rename(name), | |||
dtype); | |||
}; | |||
auto x = mkvar("x", {64, 48, 14, 14}, | |||
dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4))); | |||
auto w1 = mkcvar("w1", {48, 48, 3, 3}, dtype::QuantizedS4(2.5f)); | |||
auto b1 = mkcvar("b1", {1, 48, 1, 1}, dtype::QuantizedS32(6.25f)); | |||
opr::ConvBias::Param param; | |||
param.format = opr::ConvBias::Param::Format::NCHW; | |||
param.nonlineMode = opr::ConvBias::Param::NonlineMode::IDENTITY; | |||
param.stride_h = param.stride_w = 1; | |||
param.pad_h = param.pad_w = 1; | |||
auto c1 = opr::ConvBias::make(x, w1, b1, param, {}, | |||
OperatorNodeConfig(dtype::Quantized4Asymm( | |||
12.345f, static_cast<uint8_t>(5)))); | |||
x = opr::TypeCvt::make(c1, dtype::QuantizedS8(12.345f)); | |||
auto w2 = mkcvar("w2", {48, 48, 3, 3}, dtype::QuantizedS8(2.5f)); | |||
auto b2 = mkcvar("b2", {1, 48, 1, 1}, dtype::QuantizedS32(12.345f * 2.5f)); | |||
auto c2 = opr::ConvBias::make(x, w2, b2, param, {}, | |||
OperatorNodeConfig(dtype::QuantizedS8(2.5f))); | |||
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | |||
S strategy = S::PROFILE; | |||
gopt::modify_opr_algo_strategy_inplace({c2}, strategy); | |||
using OprFormat = OprTensorFormatsConfiguration::OprFormat; | |||
SubGraphExtractor extractor(ctx->opr_list()); | |||
auto partitions = extractor.extract({c2}); | |||
ASSERT_EQ(partitions.size(), 1u); | |||
using Attribute = Problem::Attribute; | |||
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW}; | |||
Problem problem(partitions[0], ctx->available_tensor_formats(), | |||
ctx->opr_configs(), attribute); | |||
auto profiler = ProfilerBase::make_profiler(); | |||
auto rst = profiler->profile(problem); | |||
const auto& opr_rst = rst.opr_record; | |||
const auto& var_rst = rst.var_record; | |||
EXPECT_TRUE(opr_rst.count(c1.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(opr_rst.count(c2.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(var_rst.count(w1.node()) == 0); | |||
EXPECT_TRUE(var_rst.count(b1.node()) == 0); | |||
EXPECT_TRUE(var_rst.count(w2.node()) == 0); | |||
EXPECT_TRUE(var_rst.count(b2.node()) == 0); | |||
} | |||
#endif | |||
TEST(TestProfiler, Deconv) { | |||
REQUIRE_GPU(1); | |||
auto cn = CompNode::load("gpu0"); | |||
cn.activate(); | |||
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); | |||
auto ctx = LayoutTransformContext::make(); | |||
HostTensorGenerator<dtype::Int8> gen; | |||
auto graph = ComputingGraph::make(); | |||
graph->options().graph_opt_level = 0; | |||
auto mkvar = [&](const char* name, const TensorShape& shp, | |||
const DType& dtype) { | |||
return opr::TypeCvt::make( | |||
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), | |||
dtype); | |||
}; | |||
auto mkcvar = [&](const char* name, const TensorShape& shp, | |||
const DType& dtype) { | |||
return opr::TypeCvt::make( | |||
opr::SharedDeviceTensor::make(*graph, *gen(shp, cn)) | |||
.rename(name), | |||
dtype); | |||
}; | |||
auto x = mkvar("x", {64, 10, 7, 7}, dtype::QuantizedS8(2.5f)); | |||
auto w1 = mkcvar("w1", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f)); | |||
using Param = opr::ConvolutionBackwardData::Param; | |||
Param param; | |||
param.format = opr::ConvolutionBackwardData::Param::Format::NCHW; | |||
param.stride_h = param.stride_w = 2; | |||
param.pad_h = param.pad_w = 0; | |||
auto c1 = opr::ConvolutionBackwardData::make( | |||
w1, x, param, {}, OperatorNodeConfig(dtype::QuantizedS8(2.5f))); | |||
auto w2 = mkcvar("w2", {10, 10, 2, 2}, dtype::QuantizedS8(2.5f)); | |||
auto c2 = opr::ConvolutionBackwardData::make( | |||
w2, c1, param, {}, OperatorNodeConfig(dtype::QuantizedS8(2.5f))); | |||
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | |||
S strategy = S::PROFILE; | |||
gopt::modify_opr_algo_strategy_inplace({c2}, strategy); | |||
using OprFormat = OprTensorFormatsConfiguration::OprFormat; | |||
SubGraphExtractor extractor(ctx->opr_list()); | |||
auto partitions = extractor.extract({c2}); | |||
ASSERT_EQ(partitions.size(), 1u); | |||
using Attribute = Problem::Attribute; | |||
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW}; | |||
Problem problem(partitions[0], ctx->available_tensor_formats(), | |||
ctx->opr_configs(), attribute); | |||
auto profiler = ProfilerBase::make_profiler(); | |||
auto rst = profiler->profile(problem); | |||
const auto& opr_rst = rst.opr_record; | |||
const auto& var_rst = rst.var_record; | |||
EXPECT_TRUE(opr_rst.count(c1.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(opr_rst.count(c2.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(var_rst.count(w1.node()) == 0); | |||
EXPECT_TRUE(var_rst.count(w2.node()) == 0); | |||
} | |||
TEST(TestProfiler, Warp) { | |||
REQUIRE_GPU(1); | |||
auto cn = CompNode::load("gpu0"); | |||
cn.activate(); | |||
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); | |||
auto ctx = LayoutTransformContext::make(); | |||
constexpr size_t INP_H = 10, INP_W = 10, N = 16; | |||
HostTensorGenerator<dtype::Int8> gen; | |||
auto graph = ComputingGraph::make(); | |||
graph->options().graph_opt_level = 0; | |||
auto mkvar = [&](const char* name, const TensorShape& shp, | |||
const DType& dtype) { | |||
return opr::TypeCvt::make( | |||
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), | |||
dtype); | |||
}; | |||
auto x = mkvar("x", {N, 48, INP_H, INP_W}, | |||
dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4))); | |||
float value1 = M_PI, value2 = 0.6; | |||
auto gen_mat = [&](HostTensorND& mat) { | |||
auto ptr = mat.ptr<float>(); | |||
for (size_t i = 0; i < N; ++i) { | |||
auto rot = value1, scale = value2, sheer = value1, dy = value2, | |||
dx = value2, ky = value2, kx = value2, kb = value2; | |||
ptr[0] = ptr[4] = cos(rot) * scale; | |||
ptr[1] = -(ptr[3] = sin(rot) * scale); | |||
ptr[3] *= sheer; | |||
ptr[4] *= sheer; | |||
ptr[2] = dx; | |||
ptr[5] = dy; | |||
ptr[6] = kx; | |||
ptr[7] = ky; | |||
ptr[8] = kb; | |||
ptr += 9; | |||
} | |||
mgb_assert(ptr == mat.ptr<float>() + mat.shape().total_nr_elems()); | |||
}; | |||
auto mat_host = std::make_shared<HostTensorND>( | |||
x.node()->comp_node(), TensorShape{N, 3, 3}, dtype::Float32()); | |||
gen_mat(*mat_host); | |||
auto mat = opr::Host2DeviceCopy::make(*graph, mat_host).rename("mat"); | |||
TensorShape out_shp{20, 20}; | |||
auto w1 = opr::WarpPerspectiveForward::make(x, mat, out_shp); | |||
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | |||
S strategy = S::PROFILE; | |||
gopt::modify_opr_algo_strategy_inplace({w1}, strategy); | |||
using OprFormat = OprTensorFormatsConfiguration::OprFormat; | |||
SubGraphExtractor extractor(ctx->opr_list()); | |||
auto partitions = extractor.extract({w1}); | |||
ASSERT_EQ(partitions.size(), 1u); | |||
using Attribute = Problem::Attribute; | |||
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW}; | |||
Problem problem(partitions[0], ctx->available_tensor_formats(), | |||
ctx->opr_configs(), attribute); | |||
auto profiler = ProfilerBase::make_profiler(); | |||
auto rst = profiler->profile(problem); | |||
const auto& opr_rst = rst.opr_record; | |||
const auto& var_rst = rst.var_record; | |||
EXPECT_TRUE(opr_rst.count(w1.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(var_rst.count(mat.node()) == 0); | |||
EXPECT_TRUE(var_rst.count(w1.node()->owner_opr()->input(2)) == 0); | |||
EXPECT_TRUE(var_rst.count(w1.node()->owner_opr()->input(0)) > 0); | |||
} | |||
TEST(TestProfiler, Pooling) { | |||
REQUIRE_GPU(1); | |||
auto cn = CompNode::load("gpu0"); | |||
cn.activate(); | |||
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); | |||
auto ctx = LayoutTransformContext::make(); | |||
HostTensorGenerator<dtype::Int8> gen; | |||
auto graph = ComputingGraph::make(); | |||
graph->options().graph_opt_level = 0; | |||
auto mkvar = [&](const char* name, const TensorShape& shp, | |||
const DType& dtype) { | |||
return opr::TypeCvt::make( | |||
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), | |||
dtype); | |||
}; | |||
auto x = mkvar("x", {64, 64, 55, 55}, | |||
dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4))); | |||
using Param = opr::Pooling::Param; | |||
Param param; | |||
param.format = Param::Format::NCHW; | |||
auto p1 = opr::Pooling::make(x, param); | |||
x = opr::TypeCvt::make(p1, dtype::QuantizedS8(12.345f)); | |||
auto p2 = opr::Pooling::make(x, param); | |||
using S = opr::mixin::AlgoChooserHelper::ExecutionPolicy::Strategy; | |||
S strategy = S::PROFILE; | |||
gopt::modify_opr_algo_strategy_inplace({p2}, strategy); | |||
using OprFormat = OprTensorFormatsConfiguration::OprFormat; | |||
SubGraphExtractor extractor(ctx->opr_list()); | |||
auto partitions = extractor.extract({p2}); | |||
ASSERT_EQ(partitions.size(), 1u); | |||
using Attribute = Problem::Attribute; | |||
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW}; | |||
Problem problem(partitions[0], ctx->available_tensor_formats(), | |||
ctx->opr_configs(), attribute); | |||
auto profiler = ProfilerBase::make_profiler(); | |||
auto rst = profiler->profile(problem); | |||
const auto& opr_rst = rst.opr_record; | |||
EXPECT_TRUE(opr_rst.count(p1.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(opr_rst.count(p2.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(opr_rst.count(x.node()->owner_opr()) > 0); | |||
} | |||
TEST(TestProfiler, Elemwise) { | |||
REQUIRE_GPU(1); | |||
auto cn = CompNode::load("gpu0"); | |||
cn.activate(); | |||
REQUIRE_CUDA_COMPUTE_CAPABILITY_EQ(7, 5); | |||
auto ctx = LayoutTransformContext::make(); | |||
HostTensorGenerator<dtype::Int8> gen; | |||
auto graph = ComputingGraph::make(); | |||
graph->options().graph_opt_level = 0; | |||
auto mkvar = [&](const char* name, const TensorShape& shp, | |||
const DType& dtype) { | |||
return opr::TypeCvt::make( | |||
opr::Host2DeviceCopy::make(*graph, gen(shp, cn)).rename(name), | |||
dtype); | |||
}; | |||
auto a = mkvar("a", {64, 48, 14, 14}, dtype::Float32()); | |||
auto b = mkvar("b", {1, 48, 1, 1}, dtype::Float32()); | |||
auto c = opr::Elemwise::make({a, b}, | |||
{opr::Elemwise::Param::Mode::FUSE_ADD_RELU}); | |||
auto q4c = opr::TypeCvt::make( | |||
c, dtype::Quantized4Asymm(2.5f, static_cast<uint8_t>(4))); | |||
auto q8a = mkvar("q8a", {64, 48, 14, 14}, dtype::QuantizedS8(2.5f)); | |||
auto q8b = mkvar("q8b", {64, 48, 14, 14}, dtype::QuantizedS8(1.2f)); | |||
auto q8d = opr::ElemwiseMultiType::make( | |||
{q8a, q8b}, {opr::ElemwiseMultiType::Param::Mode::QFUSE_ADD_RELU}, | |||
OperatorNodeConfig(dtype::QuantizedS8(12.f))); | |||
auto q4d = opr::TypeCvt::make( | |||
q8d, dtype::Quantized4Asymm(1.2f, static_cast<uint8_t>(3))); | |||
auto q4e = opr::ElemwiseMultiType::make( | |||
{q4c, q4d}, {opr::ElemwiseMultiType::Param::Mode::QADD}, | |||
OperatorNodeConfig( | |||
dtype::Quantized4Asymm(13.f, static_cast<uint8_t>(4)))); | |||
using OprFormat = OprTensorFormatsConfiguration::OprFormat; | |||
SubGraphExtractor extractor(ctx->opr_list()); | |||
auto partitions = extractor.extract({q4e}); | |||
ASSERT_EQ(partitions.size(), 1u); | |||
using Attribute = Problem::Attribute; | |||
Attribute attribute = {OprFormat::NCHW, TensorFormats::NCHW}; | |||
Problem problem(partitions[0], ctx->available_tensor_formats(), | |||
ctx->opr_configs(), attribute); | |||
auto profiler = ProfilerBase::make_profiler(); | |||
auto rst = profiler->profile(problem); | |||
const auto& opr_rst = rst.opr_record; | |||
const auto& var_rst = rst.var_record; | |||
EXPECT_TRUE(opr_rst.count(c.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(opr_rst.count(q8d.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(opr_rst.count(q4e.node()->owner_opr()) > 0); | |||
EXPECT_TRUE(var_rst.count(a.node()) > 0); | |||
EXPECT_TRUE(var_rst.count(b.node()) > 0); | |||
EXPECT_TRUE(var_rst.count(q8a.node()) > 0); | |||
EXPECT_TRUE(var_rst.count(q8b.node()) > 0); | |||
} | |||
#endif | |||
// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}} |
@@ -447,8 +447,6 @@ TEST(TestReformatManager, AutoAlignedFeatureProfiling) { | |||
for (size_t i = 0; i < RUNS; ++i) | |||
func->execute(); | |||
double time_profiler = profiler->duration() * 1e6; | |||
printf("%f, %f\n", time_profiler, time_cuda_evt); | |||
ASSERT_EQ(time_cuda_evt, time_profiler); | |||
MGB_CUDA_CHECK(cudaEventDestroy(evt0)); | |||
MGB_CUDA_CHECK(cudaEventDestroy(evt1)); | |||
} | |||