diff --git a/python_module/megengine/_internal/__init__.py b/python_module/megengine/_internal/__init__.py index 95a68965..9201a5da 100644 --- a/python_module/megengine/_internal/__init__.py +++ b/python_module/megengine/_internal/__init__.py @@ -539,7 +539,7 @@ def optimize_for_inference( f16_io_comp=False, use_nhwcd4=False, fuse_conv_bias_nonlinearity=False, - use_tensor_core=False, + use_nchw32=False, fuse_conv_bias_with_z=False, use_nchw88=False, use_nchw44=False @@ -564,6 +564,8 @@ def optimize_for_inference( times. :param use_nchw44: whether to use NCHW44 tensor format. This maybe faster some times. + :param use_nchw32: whether to use NCHW32 tensor format. Mainly used for + nvidia tensorcore. :return: list of transformed vars corresponding to given output vars @@ -575,15 +577,28 @@ def optimize_for_inference( for i in [ "f16_io_f32_comp", "f16_io_comp", - "use_nhwcd4", "fuse_conv_bias_nonlinearity", - "use_tensor_core", "fuse_conv_bias_with_z", - "use_nchw88", - "use_nchw44", ]: if settings[i]: getattr(opt, "enable_{}".format(i))() + + layout_tranform = None + for k, v in { + "use_nhwcd4": "nchw2nhwcd4", + "use_nchw32": "nchw2nchw32", + "use_nchw88": "nchw2nchw88", + "use_nchw44": "nchw2nchw44", + }.items(): + if settings[k]: + assert ( + not layout_tranform + ), "Only one layout transform supported, both {} and {}".format( + layout_tranform, k + ) + getattr(opt, "enable_{}".format(v))() + layout_tranform = k + vec = _detail._VectorSymbolVar() for i in output_vars: assert isinstance(i, _detail.SymbolVar), "bad var: {}".format(i) diff --git a/python_module/src/swig/misc.i b/python_module/src/swig/misc.i index 0b96f763..7b765949 100644 --- a/python_module/src/swig/misc.i +++ b/python_module/src/swig/misc.i @@ -71,15 +71,19 @@ class _PersistentCache { }; struct _OptimizeForInferenceOptions { -#define SET(n) void enable_##n() - SET(f16_io_f32_comp); - SET(f16_io_comp); - SET(fuse_conv_bias_nonlinearity); - SET(use_nhwcd4); - SET(use_tensor_core); - SET(fuse_conv_bias_with_z); - SET(use_nchw88); - SET(use_nchw44); +#define SET(n) void enable_##n(); + SET(f16_io_f32_comp); + SET(f16_io_comp); + SET(fuse_conv_bias_nonlinearity); + SET(fuse_conv_bias_with_z); +#undef SET +#define SET(_trans, _trans_capital) \ + void enable_##_trans(); \ + + SET(nchw2nhwcd4, NCHW2NHWCD4); + SET(nchw2nchw88, NCHW2NCHW88); + SET(nchw2nchw44, NCHW2NCHW44); + SET(nchw2nchw32, NCHW2NCHW32); #undef SET }; diff --git a/sdk/load-and-run/dump_with_testcase_mge.py b/sdk/load-and-run/dump_with_testcase_mge.py index a87ade52..3d67486b 100755 --- a/sdk/load-and-run/dump_with_testcase_mge.py +++ b/sdk/load-and-run/dump_with_testcase_mge.py @@ -255,7 +255,7 @@ def optimize_for_inference(args, outputs): 'enable_nchw88': 'use_nchw88', 'enable_nchw44': 'use_nchw44', 'enable_fuse_conv_bias_nonlinearity': 'fuse_conv_bias_nonlinearity', - 'enable_tensorcore': 'use_tensor_core', + 'enable_nchw32': 'use_nchw32', 'enable_fuse_conv_bias_with_z': 'fuse_conv_bias_with_z', } kwargs = {} @@ -393,7 +393,7 @@ def main(): 'for inference' ) parser.add_argument( - '--enable-tensorcore', + '--enable-nchw32', action='store_true', help='transform the model format from NCHW4 to NCHW32 ' 'for inference on nvidia TensoCore' diff --git a/src/gopt/impl/framework.cpp b/src/gopt/impl/framework.cpp index 2703fffd..a981a84d 100644 --- a/src/gopt/impl/framework.cpp +++ b/src/gopt/impl/framework.cpp @@ -642,21 +642,6 @@ GraphOptimizer& GraphOptimizer::add_preset_passes( add_pass(); add_pass(cv_type); - if (inference_opt) { - if (inference_opt->use_nhwcd4) 
{ - add_pass(ConvertFormatPass::make_nhwcd4_converter()); - } - if (inference_opt->f16_io_f32_comp) { - add_pass(ConvertF32ToF16Pass::make(true)); - } - if (inference_opt->f16_io_comp) { - add_pass(ConvertF32ToF16Pass::make(false)); - } - - // fuse again after reordering - add_pass(); - } - add_pass(); // reorder again because shapes of fused oprs might change add_pass(cv_type); @@ -687,32 +672,7 @@ GraphOptimizer& GraphOptimizer::add_preset_passes( } #endif - if (inference_opt) { - if (inference_opt->fuse_conv_bias_nonlinearity) - add_pass(); - if (inference_opt->fuse_conv_bias_with_z) { - mgb_assert(inference_opt->fuse_conv_bias_nonlinearity, - "fuse conv bias with z input should fuse conv bias " - "activation " - "first"); - add_pass(); - } - if (inference_opt->use_nchw88) { - add_pass(EnableNchwxxPass::make_nchwxx_converter(8)); - } - if (inference_opt->use_nchw44) { - add_pass(EnableNchwxxPass::make_nchwxx_converter(4)); - } - if (inference_opt->use_tensor_core) { - mgb_assert(inference_opt->fuse_conv_bias_nonlinearity, - "enable tensor core should fuse conv bias activation " - "first"); - add_pass(EnableTensorCorePass::make_tensorcore_converter()); - add_pass(); - add_pass(); - } - add_pass(); - } + apply_optimize_options(inference_opt); if (inference_opt) { // merge params to reduce loading time and graph overhead @@ -739,6 +699,42 @@ VarNode* GraphOptimizer::var_replace_lookup(VarNode *var) { } } +void GraphOptimizer::apply_optimize_options( + const OptimizeOptions* options) { + if (!options) return; + if (options->f16_io_comp) { + add_pass(ConvertF32ToF16Pass::make(false)); + } + if (options->f16_io_f32_comp) { + add_pass(ConvertF32ToF16Pass::make(true)); + } + if (options->transform_nchw2nhwcd4()) { + add_pass(ConvertFormatPass::make_nhwcd4_converter()); + add_pass(); + } + if (options->transform_nchw2nchw88()) { + add_pass(EnableNchwxxPass::make_nchwxx_converter(8)); + } + if (options->transform_nchw2nchw44()) { + add_pass(EnableNchwxxPass::make_nchwxx_converter(4)); + } + if (options->transform_nchw2nchw32()) { + add_pass(); + add_pass(EnableTensorCorePass::make_tensorcore_converter()); + add_pass(); + add_pass(); + } + + if (options->fuse_conv_bias_nonlinearity) { + add_pass(); + } + if (options->fuse_conv_bias_with_z) { + add_pass(); + add_pass(); + } + add_pass(); +} + /* ================ ConstVarPropogateBase ================ */ ConstVarPropogateBase::AddOprResult ConstVarPropogateBase::add_opr( diff --git a/src/gopt/impl/tensor_reformat.cpp b/src/gopt/impl/tensor_reformat.cpp index 0579eaa7..c797be79 100644 --- a/src/gopt/impl/tensor_reformat.cpp +++ b/src/gopt/impl/tensor_reformat.cpp @@ -1770,7 +1770,7 @@ public: return reformat.node(); }; - + m_reformat[std::make_pair(TensorFormat::CHWN4, TensorFormat::NCHW4)] = [](VarNode* inp) -> VarNode* { megdnn::param::RelayoutFormat param; diff --git a/src/gopt/include/megbrain/gopt/framework.h b/src/gopt/include/megbrain/gopt/framework.h index 19027251..cd0b3015 100644 --- a/src/gopt/include/megbrain/gopt/framework.h +++ b/src/gopt/include/megbrain/gopt/framework.h @@ -377,6 +377,57 @@ namespace gopt { RecursiveSubGraphRewriteHelper(OptState &state); }; + /** + * \brief common optimize options, it both can be used for optimize for + * inference in graph dump but also used in graph optimization in runtime. + */ + struct OptimizeOptions { + //! whether to enable IO in float16 compute in float32 + bool f16_io_f32_comp = false; + //! whether to enable tranform to pure float16 model + bool f16_io_comp = false; + //! 
whether to enable conv bias nonlinearity fusion + bool fuse_conv_bias_nonlinearity = false; + enum LayoutTransform : uint32_t { + DEFAULT, + NCHW2NHWCD4, ///< compute using NHWCD4 tensor format + NCHW2NCHW88, ///< compute using NCHW88 tensor format + NCHW2NCHW44, ///< compute using NCHW44 tensor format + NCHW2NCHW32, ///< compute using NCHW32 tensor format, used for + ///< tensorcore + }; + LayoutTransform layout_transform = LayoutTransform::DEFAULT; + //! fuse pattern like ReLU(conv_bias(x, w, b) + z) or conv_bias(x, w, b) + //! + z -> conv_bias(x, w, b, z) + bool fuse_conv_bias_with_z = false; + +#define SET(n) \ + OptimizeOptions& enable_##n() { \ + n = true; \ + return *this; \ + } + SET(f16_io_f32_comp); + SET(f16_io_comp); + SET(fuse_conv_bias_nonlinearity); + SET(fuse_conv_bias_with_z); +#undef SET +#define SET(_trans, _trans_capital) \ + OptimizeOptions& enable_##_trans() { \ + layout_transform = LayoutTransform::_trans_capital; \ + return *this; \ + } \ + bool transform_##_trans() const { \ + return layout_transform == LayoutTransform::_trans_capital; \ + } + + SET(nchw2nhwcd4, NCHW2NHWCD4); + SET(nchw2nchw88, NCHW2NCHW88); + SET(nchw2nchw44, NCHW2NCHW44); + SET(nchw2nchw32, NCHW2NCHW32); +#undef SET + }; + + /*! * \brief manage passes and their applying on graphs * @@ -465,6 +516,11 @@ namespace gopt { * var_replace_map(var->owner_graph()) corresponding to var */ static VarNode* var_replace_lookup(VarNode *var); + + /** + * \brief apply optimize options + */ + void apply_optimize_options(const OptimizeOptions* options); }; /*! diff --git a/src/gopt/include/megbrain/gopt/inference.h b/src/gopt/include/megbrain/gopt/inference.h index af86d61b..773d05ec 100644 --- a/src/gopt/include/megbrain/gopt/inference.h +++ b/src/gopt/include/megbrain/gopt/inference.h @@ -256,40 +256,7 @@ namespace gopt { size_t pack_c_size); }; - struct OptimizeForInferenceOptions { - //! whether to enable IO in float16 compute in float32 - bool f16_io_f32_comp = false; - //! whether to enable tranform to pure float16 model - bool f16_io_comp = false; - //! whether to enable conv bias nonlinearity fusion - bool fuse_conv_bias_nonlinearity = false; - //! whether to compute using NHWCD4 tensor format - bool use_nhwcd4 = false; - //! whether to compute using NCHW88 tensor format - bool use_nchw88 = false; - //! whether to compute using NCHW44 tensor format - bool use_nchw44 = false; - //! whether to enable tensor core - bool use_tensor_core = false; - //! fuse pattern like ReLU(conv_bias(x, w, b) + z) or conv_bias(x, w, b) - //! + z -> conv_bias(x, w, b, z) - bool fuse_conv_bias_with_z = false; - -#define SET(n) \ - OptimizeForInferenceOptions& enable_##n() { \ - n = true; \ - return *this; \ - } - SET(f16_io_f32_comp); - SET(f16_io_comp); - SET(fuse_conv_bias_nonlinearity); - SET(use_nhwcd4); - SET(use_tensor_core); - SET(fuse_conv_bias_with_z); - SET(use_nchw88); - SET(use_nchw44); -#undef SET - }; + struct OptimizeForInferenceOptions : OptimizeOptions {}; /*! 
* \brief optimize a computing graph for inference diff --git a/src/gopt/test/inference.cpp b/src/gopt/test/inference.cpp index 7d6456ac..b271b764 100644 --- a/src/gopt/test/inference.cpp +++ b/src/gopt/test/inference.cpp @@ -635,10 +635,9 @@ TEST(TestGoptInference, Float16IOFloat32Compute) { y = opr::Concat::make({y, -y}, 0); y = opr::Reduce::make(y, {}, y.make_scalar(1)); SymbolVar y_opt; - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_f16_io_f32_comp()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_f16_io_f32_comp(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); ASSERT_EQ(y_opt.dtype(), dtype::Float32()); HostTensorND host_y, host_y_opt; @@ -683,10 +682,9 @@ TEST(TestGoptInference, Float16IOFloat32ComputeWarpPerspective) { TensorShape out_shp{20, 20}; auto y = opr::WarpPerspective::make(a, mat, out_shp); SymbolVar y_opt; - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_f16_io_f32_comp()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_f16_io_f32_comp(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); ASSERT_EQ(y_opt.dtype(), dtype::Float32()); HostTensorND host_y, host_y_opt; auto func = graph->compile({make_callback_copy(y, host_y), @@ -723,10 +721,9 @@ TEST(TestGoptInference, Float16IOFloat32ComputeRemap) { auto map = opr::Host2DeviceCopy::make(*graph, map_host).rename("map"); auto y = opr::Remap::make(a, map); SymbolVar y_opt; - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_f16_io_f32_comp()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_f16_io_f32_comp(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); ASSERT_EQ(y_opt.dtype(), dtype::Float32()); HostTensorND host_y, host_y_opt; auto func = graph->compile({make_callback_copy(y, host_y), @@ -770,10 +767,9 @@ TEST(TestGoptInference, Uint8IOFloat16ComputeWarpPerspective) { TensorShape out_shp{20, 20}; auto y = opr::WarpPerspective::make(a, mat, out_shp); SymbolVar y_opt; - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_f16_io_comp()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_f16_io_comp(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); ASSERT_EQ(y_opt.dtype(), dtype::Uint8()); HostTensorND host_y, host_y_opt; auto func = graph->compile({make_callback_copy(y, host_y), @@ -801,10 +797,9 @@ TEST(TestGoptInference, Float32TOFloat16) { y = opr::Reduce::make(y, {}, y.make_scalar(1)); SymbolVar y_opt; - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_f16_io_comp()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_f16_io_comp(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); return y_opt; }; @@ -857,10 +852,9 @@ TEST(TestGoptInference, Float32TOFloat16EndpointElemwise) { auto y = d0 + b; SymbolVar y_opt; - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_f16_io_comp()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_f16_io_comp(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); return y_opt; }; @@ -897,7 +891,7 @@ TEST(TestGoptInference, Float32TOFloat16EndpointElemwise) { TEST(TestGoptInference, Float32TOFloat16Linspace) 
{ CompNode cn = CompNode::load("cpu0"); HostTensorGenerator<> gen(0, 1, 0); - auto host_x = gen({3, 1}, cn); + auto host_x = gen({3, 1}, cn); auto graph = ComputingGraph::make(); auto make_f32_to_f16_graph = [&]() { @@ -916,10 +910,9 @@ TEST(TestGoptInference, Float32TOFloat16Linspace) { auto mm = opr::MatrixMul::make(x, y); SymbolVar mm_opt; - unpack_vector(gopt::optimize_for_inference( - {mm}, gopt::OptimizeForInferenceOptions{} - .enable_f16_io_comp()), - mm_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_f16_io_comp(); + unpack_vector(gopt::optimize_for_inference({mm}, options), mm_opt); return mm_opt; }; @@ -998,11 +991,9 @@ TEST(TestGoptInference, ConvertFormatNHWCD4) { y = opr::Convolution::make(elem, w2, param); SymbolVar y_opt; - unpack_vector( - gopt::optimize_for_inference( - {y}, - gopt::OptimizeForInferenceOptions{}.enable_use_nhwcd4()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_nchw2nhwcd4(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); ASSERT_EQ(opr::Convolution::Param::Format::NHWCD4, find_opr(y_opt).param().format); @@ -1059,11 +1050,9 @@ TEST(TestGoptInference, ConvertFormatNHWCD4LOCAL) { y = opr::Convolution::make(group_local, w5, param); SymbolVar y_opt; - unpack_vector( - gopt::optimize_for_inference( - {y}, - gopt::OptimizeForInferenceOptions{}.enable_use_nhwcd4()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_nchw2nhwcd4(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); ASSERT_EQ(opr::Convolution::Param::Format::NHWCD4, find_opr(y_opt).param().format); @@ -1112,11 +1101,9 @@ TEST(TestGoptInference, ConvertFormatNHWCD4Deconv) { y = opr::ConvolutionBackwardData::make(w1, conv, param, {}, {}); SymbolVar y_opt; - unpack_vector( - gopt::optimize_for_inference( - {y}, - gopt::OptimizeForInferenceOptions{}.enable_use_nhwcd4()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_nchw2nhwcd4(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); ASSERT_EQ(opr::Convolution::Param::Format::NCHW, find_opr(y_opt).param().format); @@ -1159,11 +1146,9 @@ TEST(TestGoptInference, ConvertFormatNHWCD4Qint8) { OperatorNodeConfig{dtype::QuantizedS8(0.2f)}); SymbolVar y_opt; - unpack_vector( - gopt::optimize_for_inference( - {y}, - gopt::OptimizeForInferenceOptions{}.enable_use_nhwcd4()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_nchw2nhwcd4(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); ASSERT_EQ(opr::ConvBias::Param::Format::NHWCD4, find_opr(y_opt).param().format); @@ -1213,11 +1198,9 @@ TEST(TestGoptInference, ConvertFormatPadIC) { auto w1 = mkcvar("w1", {12, 12, 3, 3}); auto y = opr::Convolution::make(concat, w1, param); SymbolVar y_opt; - unpack_vector( - gopt::optimize_for_inference( - {y}, - gopt::OptimizeForInferenceOptions{}.enable_use_nhwcd4()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_nchw2nhwcd4(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); HostTensorND host_y_opt, host_y; auto func = graph->compile({make_callback_copy(y, host_y), @@ -1301,11 +1284,9 @@ TEST(TestGoptInference, ConvBiasNonlinearityFusePass) { opr::Elemwise::make({y_cut}, opr::Elemwise::Param::Mode::RELU), y_y = opr::Convolution::make(y_expand, w3, param), y = y_y + y_tmp; SymbolVar y_opt; - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - 
.enable_use_nhwcd4() - .enable_fuse_conv_bias_nonlinearity()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_nchw2nhwcd4().enable_fuse_conv_bias_nonlinearity(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); ASSERT_EQ(3u, find_opr(y_opt).input().size()); graph->compile({{y_opt, {}}}) ->to_json() @@ -1533,15 +1514,16 @@ TEST(TestEnableTensorCore, SmallInputShape) { SymbolVar y_opt; SymbolVar y_no_tc; - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_fuse_conv_bias_nonlinearity() - .enable_use_tensor_core()), - y_opt); - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_fuse_conv_bias_nonlinearity()), - y_no_tc); + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_nchw2nchw32().enable_fuse_conv_bias_nonlinearity(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); + } + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc); + } auto nr_dimshuffle = find_opr_num(y_opt); ASSERT_EQ(2u, nr_dimshuffle); HostTensorND host_y, host_y_opt; @@ -1597,15 +1579,16 @@ TEST(TestEnableTensorCore, ConvBiasWithZ) { SymbolVar y_opt; SymbolVar y_no_tc; - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_fuse_conv_bias_nonlinearity() - .enable_use_tensor_core()), - y_opt); - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_fuse_conv_bias_nonlinearity()), - y_no_tc); + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity().enable_nchw2nchw32(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); + } + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc); + } HostTensorND host_y, host_y_opt; auto func = graph->compile({make_callback_copy(y_no_tc, host_y), make_callback_copy(y_opt, host_y_opt)}); @@ -1664,15 +1647,16 @@ TEST(TestGoptInference, EnableTensorCore) { y4 = opr::TypeCvt::make(y4, dtype::Float32()); SymbolVar y_opt; SymbolVar y_no_tc; - unpack_vector(gopt::optimize_for_inference( - {y4}, gopt::OptimizeForInferenceOptions{} - .enable_fuse_conv_bias_nonlinearity() - .enable_use_tensor_core()), - y_opt); - unpack_vector(gopt::optimize_for_inference( - {y4}, gopt::OptimizeForInferenceOptions{} - .enable_fuse_conv_bias_nonlinearity()), - y_no_tc); + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity().enable_nchw2nchw32(); + unpack_vector(gopt::optimize_for_inference({y4}, options), y_opt); + } + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity().enable_nchw2nchw32(); + unpack_vector(gopt::optimize_for_inference({y4}, options), y_no_tc); + } auto nr_dimshuffle = find_opr_num(y_opt); ASSERT_EQ(3u, nr_dimshuffle); graph->compile({{y_opt, {}}}) @@ -1763,15 +1747,17 @@ TEST(FuseConvBiasZPass, BlockFuse) { SymbolVar z_fuse; SymbolVar z_nonfuse; - unpack_vector(gopt::optimize_for_inference( - {z}, gopt::OptimizeForInferenceOptions{} - .enable_fuse_conv_bias_nonlinearity() - .enable_fuse_conv_bias_with_z()), - z_fuse); - unpack_vector(gopt::optimize_for_inference( - {z4}, gopt::OptimizeForInferenceOptions{} - 
.enable_fuse_conv_bias_nonlinearity()), - z_nonfuse); + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity() + .enable_fuse_conv_bias_with_z(); + unpack_vector(gopt::optimize_for_inference({z}, options), z_fuse); + } + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity(); + unpack_vector(gopt::optimize_for_inference({z4}, options), z_nonfuse); + } auto nr_elem_multi_type = find_opr_num(z_fuse); MGB_MARK_USED_VAR(nr_elem_multi_type); ASSERT_EQ(1u, nr_elem_multi_type); @@ -1867,15 +1853,16 @@ TEST(TestEnableTensorCore, ShuffleMerge) { SymbolVar y_opt; SymbolVar y_no_tc; - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_fuse_conv_bias_nonlinearity() - .enable_use_tensor_core()), - y_opt); - unpack_vector(gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_fuse_conv_bias_nonlinearity()), - y_no_tc); + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity().enable_nchw2nchw32(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); + } + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_no_tc); + } auto nr_dimshuffle = find_opr_num(y_opt); ASSERT_EQ(3u, nr_dimshuffle); HostTensorND host_y, host_y_opt; @@ -1932,13 +1919,13 @@ TEST(FuseConvBiasZPass, Basic) { opr::ElemwiseMultiType::Param::Mode::QFUSE_ADD_RELU}) { auto y1 = opr::ElemwiseMultiType::make( {y, b1}, {mode}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); - unpack_vector( - gopt::optimize_for_inference( - {y1}, gopt::OptimizeForInferenceOptions{} - .enable_fuse_conv_bias_nonlinearity() - .enable_fuse_conv_bias_with_z() - .enable_use_tensor_core()), - y_opt); + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity() + .enable_fuse_conv_bias_with_z() + .enable_nchw2nchw32(); + unpack_vector(gopt::optimize_for_inference({y1}, options), y_opt); + } auto nr_elemwisemultitype = find_opr_num(y_opt); if (mode == opr::ElemwiseMultiType::Param::Mode::QMUL) { ASSERT_NE(0u, nr_elemwisemultitype); @@ -1949,13 +1936,14 @@ TEST(FuseConvBiasZPass, Basic) { auto y2 = opr::ElemwiseMultiType::make( {y1, b2}, {mode}, OperatorNodeConfig{dtype::QuantizedS8(2.5f)}); - unpack_vector( - gopt::optimize_for_inference( - {y2}, gopt::OptimizeForInferenceOptions{} - .enable_fuse_conv_bias_nonlinearity() - .enable_fuse_conv_bias_with_z() - .enable_use_tensor_core()), - y_opt); + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity() + .enable_fuse_conv_bias_with_z() + .enable_nchw2nchw32(); + unpack_vector(gopt::optimize_for_inference({y2}, options), + y_opt); + } auto nr_elemwisemultitype = find_opr_num(y_opt); ASSERT_NE(0u, nr_elemwisemultitype); @@ -2401,11 +2389,11 @@ TEST(TestGoptInference, ConvertFormatNCHW88) { y = opr::ConvBias::make(conv5, w6, b6, param_conv_bias); SymbolVar y_opt; - unpack_vector( - gopt::optimize_for_inference( - {y}, - gopt::OptimizeForInferenceOptions{}.enable_use_nchw88()), - y_opt); + { + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_nchw2nchw88(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); + } ASSERT_EQ(opr::ConvBias::Param::Format::NCHW88, find_opr(y_opt).param().format); @@ -2483,11 +2471,9 @@ TEST(TestGoptInference, ConvertFormatNCHW44) { y 
= opr::ConvBias::make(conv5, w6, b6, param_conv_bias); SymbolVar y_opt; - unpack_vector( - gopt::optimize_for_inference( - {y}, - gopt::OptimizeForInferenceOptions{}.enable_use_nchw44()), - y_opt); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_nchw2nchw44(); + unpack_vector(gopt::optimize_for_inference({y}, options), y_opt); ASSERT_EQ(opr::ConvBias::Param::Format::NCHW44, find_opr(y_opt).param().format); diff --git a/src/opr/test/dnn/convolution.cpp b/src/opr/test/dnn/convolution.cpp index 8eecfaa6..f28509ef 100644 --- a/src/opr/test/dnn/convolution.cpp +++ b/src/opr/test/dnn/convolution.cpp @@ -495,7 +495,7 @@ TEST(TestOprDNN, ConvolutionBackwardFilter) { Param{Mode::CROSS_CORRELATION, PH, PW, SH, SW}); dest[0] = *out; }; - + #define get_shp(N, P, S, F) ((N + 2 * P - F) / S + 1) #define inp_tensor(N, IC, OC, IH, IW, FH, FW) \ { TensorShape{N, IC, IH, IW}, \ @@ -1282,9 +1282,10 @@ TEST(TestOprDNN, ConvBiasINT8x8xX_NCHW4) { *graph, inp[i]); } + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_fuse_conv_bias_nonlinearity(); auto y = gopt::optimize_for_inference({make_graph(inputs)[0]}, - gopt::OptimizeForInferenceOptions{}.enable_fuse_conv_bias_nonlinearity())[0]; - //gopt::OptimizeForInferenceOptions{})[0]; + options)[0]; auto func = graph->compile({make_callback_copy(y, dest[0])}); func->execute(); func->wait(); @@ -1720,7 +1721,7 @@ TEST(TestOprDNN, DeformableConvForward) { } }; //! generate offset to avoid value near integer - /// because bilinear function is not derivable over there + /// because bilinear function is not derivable over there checker.set_input_generator(2, gen_off); checker.set_input_dtype(0, dtype::Float32()); checker.set_input_dtype(1, dtype::Float32()); diff --git a/src/opr/test/io.cpp b/src/opr/test/io.cpp index 08b60cdb..7bff8f77 100644 --- a/src/opr/test/io.cpp +++ b/src/opr/test/io.cpp @@ -500,10 +500,10 @@ TEST(TestOprIO, MultipleDeviceTensorWithFormatHolderCpu) { conv2 = opr::Convolution::make(conv1, w2, param); auto y = opr::Elemwise::make({conv2}, opr::Elemwise::Param::Mode::RELU); - SymbolVar y_opt = gopt::optimize_for_inference( - {y}, gopt::OptimizeForInferenceOptions{} - .enable_use_nhwcd4())[0] - .rename("out"); + auto options = gopt::OptimizeForInferenceOptions{}; + options.enable_nchw2nhwcd4(); + SymbolVar y_opt = + gopt::optimize_for_inference({y}, options)[0].rename("out"); auto dumper = serialization::GraphDumper::make( serialization::OutputFile::make_fs(fname.c_str()));
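
Note (not part of the patch): a minimal C++ usage sketch of the options API introduced above, mirroring the updated tests. The wrapper function name is hypothetical; the calls (`enable_fuse_conv_bias_nonlinearity()`, `enable_nchw2nchw32()`, `gopt::optimize_for_inference()`) are taken from this diff. NCHW32 targets NVIDIA TensorCore and, per the patch, expects conv-bias-nonlinearity fusion to be enabled as well; only one layout transform can be selected at a time, since the enable_nchw2* setters all write the single `layout_transform` enum field.

    // hypothetical helper, assuming the post-patch headers and signatures
    #include "megbrain/gopt/inference.h"

    mgb::SymbolVarArray optimize_for_tensorcore(const mgb::SymbolVarArray& outputs) {
        auto options = mgb::gopt::OptimizeForInferenceOptions{};
        // chainable setters: fuse conv+bias+activation, then pick the
        // NCHW32 layout transform (layout_transform = NCHW2NCHW32)
        options.enable_fuse_conv_bias_nonlinearity().enable_nchw2nchw32();
        return mgb::gopt::optimize_for_inference(outputs, options);
    }

The same selection is exposed to Python through the renamed `use_nchw32` keyword of `optimize_for_inference` and the `--enable-nchw32` flag of dump_with_testcase_mge.py, replacing the former `use_tensor_core` / `--enable-tensorcore` names.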