diff --git a/dnn/test/common/opr_algo_proxy.h b/dnn/test/common/opr_algo_proxy.h
index fa362756..b2a3d953 100644
--- a/dnn/test/common/opr_algo_proxy.h
+++ b/dnn/test/common/opr_algo_proxy.h
@@ -6,7 +6,8 @@
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ * implied.
  */
 #pragma once
 
@@ -20,36 +21,126 @@ namespace test {
 template <typename Opr, size_t Arity>
 struct AlgoProxy;
 
-template <typename Opr>
-struct AlgoProxy<Opr, 3> {
-    static std::vector<typename Opr::AlgorithmInfo> get_all_algorithms_info(
-            Opr* opr, TensorLayoutArray& layouts) {
-        megdnn_assert(layouts.size() == 3);
-        return opr->get_all_algorithms_info(layouts[0], layouts[1], layouts[2]);
-    }
-    static typename Opr::AlgorithmInfo get_algorithm_info_heuristic(
-            Opr* opr, TensorLayoutArray& layouts) {
-        megdnn_assert(layouts.size() == 3);
-        return opr->get_algorithm_info_heuristic(layouts[0], layouts[1],
-                                                 layouts[2]);
+#define DEF_ALGO_PROXY(arity)                                                 \
+    template <typename Opr>                                                   \
+    struct AlgoProxy<Opr, arity> {                                            \
+        static std::vector<typename Opr::AlgorithmInfo>                       \
+        get_all_algorithms_info(Opr* opr, const TensorLayoutArray& layouts) { \
+            megdnn_assert(layouts.size() == arity);                           \
+            return opr->get_all_algorithms_info(LAYOUTS);                     \
+        }                                                                     \
+        static typename Opr::AlgorithmInfo get_algorithm_info_heuristic(      \
+                Opr* opr, const TensorLayoutArray& layouts) {                 \
+            megdnn_assert(layouts.size() == arity);                           \
+            return opr->get_algorithm_info_heuristic(LAYOUTS);                \
+        }                                                                     \
+        static size_t get_workspace_in_bytes(                                 \
+                Opr* opr, const TensorLayoutArray& layouts) {                 \
+            megdnn_assert(layouts.size() == arity);                           \
+            return opr->get_workspace_in_bytes(LAYOUTS);                      \
+        }                                                                     \
+        static void exec(Opr* opr, const TensorNDArray& tensors,              \
+                         Workspace workspace) {                               \
+            megdnn_assert(tensors.size() == arity);                           \
+            return opr->exec(TENSORS, workspace);                             \
+        }                                                                     \
     }
-};
 
-template <typename Opr>
-struct AlgoProxy<Opr, 5> {
-    static std::vector<typename Opr::AlgorithmInfo> get_all_algorithms_info(
-            Opr* opr, TensorLayoutArray& layouts) {
-        megdnn_assert(layouts.size() == 5);
-        return opr->get_all_algorithms_info(layouts[0], layouts[1], layouts[2],
-                                            layouts[3], layouts[4]);
-    }
-    static typename Opr::AlgorithmInfo get_algorithm_info_heuristic(
-            Opr* opr, TensorLayoutArray& layouts) {
-        megdnn_assert(layouts.size() == 5);
-        return opr->get_algorithm_info_heuristic(
-                layouts[0], layouts[1], layouts[2], layouts[3], layouts[4]);
-    }
-};
+#define LAYOUTS layouts[0], layouts[1], layouts[2]
+#define TENSORS tensors[0], tensors[1], tensors[2]
+DEF_ALGO_PROXY(3);  // defines AlgoProxy<Opr, 3>; LAYOUTS/TENSORS expand to 3 args
+#undef LAYOUTS
+#undef TENSORS
+
+#define LAYOUTS layouts[0], layouts[1], layouts[2], layouts[3], layouts[4]
+#define TENSORS tensors[0], tensors[1], tensors[2], tensors[3], tensors[4]
+DEF_ALGO_PROXY(5);  // defines AlgoProxy<Opr, 5>; LAYOUTS/TENSORS expand to 5 args
+#undef LAYOUTS
+#undef TENSORS
+
+#define LAYOUTS                                                             \
+    layouts[0], layouts[1], layouts[2], layouts[3], layouts[4], layouts[5], \
+            layouts[6], layouts[7]
+#define TENSORS                                                             \
+    tensors[0], tensors[1], tensors[2], tensors[3], tensors[4], tensors[5], \
+            tensors[6], tensors[7]
+DEF_ALGO_PROXY(8);  // defines AlgoProxy<Opr, 8>; LAYOUTS/TENSORS expand to 8 args
+#undef LAYOUTS
+#undef TENSORS
+
+#undef DEF_ALGO_PROXY  // dropped so the two-argument variant below can reuse the name
+
+#define DEF_ALGO_PROXY(Opr, arity)                                             \
+    template <>                                                                \
+    struct AlgoProxy<Opr, arity> {                                             \
+        static std::vector<typename Opr::AlgorithmInfo>                        \
+        get_all_algorithms_info(Opr* opr, const TensorLayoutArray& layouts) {  \
+            megdnn_assert(layouts.size() == arity);                            \
+            return opr->get_all_algorithms_info(LAYOUTS);                      \
+        }                                                                      \
+        static typename Opr::AlgorithmInfo get_algorithm_info_heuristic(       \
+                Opr* opr, const TensorLayoutArray& layouts) {                  \
+            megdnn_assert(layouts.size() == arity);                            \
+            return opr->get_algorithm_info_heuristic(LAYOUTS);                 \
+        }                                                                      \
+        static size_t get_workspace_in_bytes(                                  \
+                Opr* opr, const TensorLayoutArray& layouts,                    \
+                const typename Opr::PreprocessedFilter* preprocessed_filter =  \
+                        nullptr) {                                             \
+            megdnn_assert(layouts.size() == arity);                            \
+            return opr->get_workspace_in_bytes(LAYOUTS, preprocessed_filter);  \
+        }                                                                      \
+        static void exec(                                                      \
+                Opr* opr, const TensorNDArray& tensors,                        \
+                const typename Opr::PreprocessedFilter* preprocessed_filter,   \
+                Workspace workspace) {                                         \
+            megdnn_assert(tensors.size() == arity);                            \
+            return opr->exec(TENSORS, preprocessed_filter, workspace);         \
+        }                                                                      \
+        static void exec(Opr* opr, const TensorNDArray& tensors,               \
+                         Workspace workspace) {                                \
+            megdnn_assert(tensors.size() == arity);                            \
+            return opr->exec(TENSORS, nullptr, workspace);                     \
+        }                                                                      \
+        static size_t get_preprocess_workspace_in_bytes(                       \
+                Opr* opr, const TensorLayoutArray& layouts) {                  \
+            megdnn_assert(layouts.size() == arity);                            \
+            return opr->get_preprocess_workspace_in_bytes(LAYOUTS);            \
+        }                                                                      \
+        static SmallVector<TensorLayout> deduce_preprocessed_filter_layout(    \
+                Opr* opr, const TensorLayoutArray& layouts) {                  \
+            megdnn_assert(layouts.size() == arity);                            \
+            return opr->deduce_preprocessed_filter_layout(LAYOUTS);            \
+        }                                                                      \
+        /* Forwards to opr->exec_preprocess() using PREPROCESS_ARGS. */        \
+        static void exec_preprocess(Opr* opr, const TensorNDArray& tensors,    \
+                const TensorLayoutArray& layouts,                              \
+                typename Opr::PreprocessedFilter* preprocessed_filter,         \
+                _megdnn_workspace workspace) {                                 \
+            megdnn_assert(layouts.size() == arity && tensors.size() == arity); \
+            return opr->exec_preprocess(PREPROCESS_ARGS, preprocessed_filter,  \
+                                        workspace);                            \
+        }                                                                      \
+    };
+
+#define LAYOUTS layouts[0], layouts[1], layouts[2]
+#define TENSORS tensors[0], tensors[1], tensors[2]
+#define PREPROCESS_ARGS layouts[0], tensors[1], layouts[2]
+DEF_ALGO_PROXY(ConvolutionForward, 3);  // exec_preprocess: arg 1 is a tensor, args 0 and 2 are layouts
+#undef PREPROCESS_ARGS
+#undef LAYOUTS
+#undef TENSORS
+
+#define LAYOUTS layouts[0], layouts[1], layouts[2], layouts[3], layouts[4]
+#define TENSORS tensors[0], tensors[1], tensors[2], tensors[3], tensors[4]
+#define PREPROCESS_ARGS \
+    layouts[0], tensors[1], tensors[2], layouts[3], layouts[4]
+DEF_ALGO_PROXY(ConvBias, 5);  // exec_preprocess: args 1 and 2 are tensors, args 0, 3 and 4 are layouts
+#undef PREPROCESS_ARGS
+#undef LAYOUTS
+#undef TENSORS
+
+#undef DEF_ALGO_PROXY  // helper macro is local to this header
 
 template <typename Opr, size_t arity = OprTrait<Opr>::arity>
 struct OprAlgoProxyDefaultImpl : public AlgoProxy<Opr, arity> {};
diff --git a/dnn/test/common/opr_proxy.h b/dnn/test/common/opr_proxy.h
index 5d587415..a3ef862e 100644
--- a/dnn/test/common/opr_proxy.h
+++ b/dnn/test/common/opr_proxy.h
@@ -14,6 +14,7 @@
 #include "test/common/deduce_layout_proxy.h"
 #include "test/common/exec_proxy.h"
 #include "test/common/inspect_type.h"
+#include "test/common/opr_algo_proxy.h"
 #include "test/common/opr_trait.h"
 #include "test/common/timer.h"
 #include "test/common/workspace_wrapper.h"
@@ -166,104 +167,33 @@ struct OprProxyProfilingBase
         }
         return ret;
     }
-};
 
-template <class Opr>
-struct OprProxyProfilingTernary : public OprProxyProfilingBase<Opr, 3> {
-    using Base = OprProxyProfilingBase<Opr, 3>;
-    using OprProxyProfilingBase<Opr, 3>::OprProxyProfilingBase;
     void exec(Opr* opr, const TensorNDArray& tensors) {
-        megdnn_assert(tensors.size() == 3);
-        if (!Base::W.valid()) {
-            Base::W = WorkspaceWrapper(opr->handle(), 0);
+        megdnn_assert(tensors.size() == arity);
+        if (!W.valid()) {
+            W = WorkspaceWrapper(opr->handle(), 0);
         }
-        if (Base::m_profiling && !Base::target_algo_info.valid()) {
-            size_t min_time = std::numeric_limits<size_t>::max();
-            for (auto algo : opr->get_all_algorithms_info(tensors[0].layout,
-                                                          tensors[1].layout,
-                                                          tensors[2].layout)) {
-                opr->execution_policy().algo = algo;
-                auto workspace_size = opr->get_workspace_in_bytes(
-                        tensors[0].layout, tensors[1].layout,
-                        tensors[2].layout);
-                Base::W.update(workspace_size);
-
-                for (size_t times = 0; times < Base::warmup_times; ++times)
-                    opr->exec(tensors[0], tensors[1], tensors[2],
-                              Base::W.workspace());
-                megcoreSynchronize(opr->handle()->megcore_computing_handle());
-                Timer timer;
-                timer.start();
-                for (size_t times = 0; times < Base::exec_times; ++times) {
-                    opr->exec(tensors[0], tensors[1], tensors[2],
-                              Base::W.workspace());
-                }
-                megcoreSynchronize(opr->handle()->megcore_computing_handle());
-                timer.stop();
-                printf("%.3fms %s\n", timer.get_time_in_us() / 1e3,
-                       algo.name.c_str());
-                if (min_time > timer.get_time_in_us()) {
-                    min_time = timer.get_time_in_us();
-                    Base::target_algo_info = algo;
-                }
-            }
-            opr->execution_policy().algo = Base::target_algo_info;
-            auto workspace_size = opr->get_workspace_in_bytes(
-                    tensors[0].layout, tensors[1].layout, tensors[2].layout);
-            Base::W.update(workspace_size);
+        TensorLayoutArray layouts;
+        for (auto&& tensor : tensors) {
+            layouts.push_back(tensor.layout);
         }
-        if (!Base::target_algo_info.valid()) {
-            auto workspace_size = opr->get_workspace_in_bytes(
-                    tensors[0].layout, tensors[1].layout, tensors[2].layout);
-            Base::W.update(workspace_size);
-        }
-        opr->exec(tensors[0], tensors[1], tensors[2], Base::W.workspace());
-    }
-};
-
-#define DEF_PROF3(c)                                                 \
-    template <>                                                      \
-    struct OprProxy<c> : public OprProxyProfilingTernary<c> {        \
-        using OprProxyProfilingTernary<c>::OprProxyProfilingTernary; \
-    }
-
-DEF_PROF3(ConvolutionBackwardData);
-DEF_PROF3(ConvolutionBackwardFilter);
-DEF_PROF3(LocalShareForward);
-DEF_PROF3(LocalShareBackwardData);
-DEF_PROF3(LocalShareBackwardFilter);
-#undef DEF_PROF3
-
-template <>
-struct OprProxy<ConvolutionForward>
-        : public OprProxyProfilingTernary<ConvolutionForward> {
-    using OprProxyProfilingTernary<
-            ConvolutionForward>::OprProxyProfilingTernary;
-    void exec(ConvolutionForward* opr, const TensorNDArray& tensors) {
-        megdnn_assert(tensors.size() == 3);
-        if (!Base::W.valid()) {
-            Base::W = WorkspaceWrapper(opr->handle(), 0);
-        }
-        if (Base::m_profiling && !Base::target_algo_info.desc.valid()) {
+        if (m_profiling && !target_algo_info.valid()) {
             size_t min_time = std::numeric_limits<size_t>::max();
-            for (auto algo : opr->get_all_algorithms_info(tensors[0].layout,
-                                                          tensors[1].layout,
-                                                          tensors[2].layout)) {
+            for (auto algo :
+                 AlgoProxy<Opr, arity>::get_all_algorithms_info(opr, layouts)) {
                 opr->execution_policy().algo = algo;
-                auto workspace_size = opr->get_workspace_in_bytes(
-                        tensors[0].layout, tensors[1].layout, tensors[2].layout,
-                        nullptr);
-                Base::W.update(workspace_size);
+                auto workspace_size =
+                        AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr,
+                                                                      layouts);
+                W.update(workspace_size);
 
-                for (size_t times = 0; times < Base::warmup_times; ++times)
-                    opr->exec(tensors[0], tensors[1], tensors[2], nullptr,
-                              Base::W.workspace());
+                for (size_t times = 0; times < warmup_times; ++times)
+                    AlgoProxy<Opr, arity>::exec(opr, tensors, W.workspace());
                 megcoreSynchronize(opr->handle()->megcore_computing_handle());
                 Timer timer;
                 timer.start();
-                for (size_t times = 0; times < Base::exec_times; ++times) {
-                    opr->exec(tensors[0], tensors[1], tensors[2], nullptr,
-                              Base::W.workspace());
+                for (size_t times = 0; times < exec_times; ++times) {
+                    AlgoProxy<Opr, arity>::exec(opr, tensors, W.workspace());
                 }
                 megcoreSynchronize(opr->handle()->megcore_computing_handle());
                 timer.stop();
@@ -271,286 +201,86 @@ struct OprProxy<ConvolutionForward>
                        algo.name.c_str());
                 if (min_time > timer.get_time_in_us()) {
                     min_time = timer.get_time_in_us();
-                    Base::target_algo_info = algo;
+                    target_algo_info = algo;
                 }
             }
-            opr->execution_policy().algo = Base::target_algo_info;
-            auto workspace_size = opr->get_workspace_in_bytes(
-                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
-                    nullptr);
-            Base::W.update(workspace_size);
+            opr->execution_policy().algo = target_algo_info;
+            auto workspace_size =
+                    AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr, layouts);
+            W.update(workspace_size);
         }
-        if (!Base::target_algo_info.desc.valid()) {
-            auto workspace_size = opr->get_workspace_in_bytes(
-                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
-                    nullptr);
-            Base::W.update(workspace_size);
+        if (!target_algo_info.valid()) {
+            auto workspace_size =
+                    AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr, layouts);
+            W.update(workspace_size);
         }
-        opr->exec(tensors[0], tensors[1], tensors[2], nullptr,
-                  Base::W.workspace());
+        AlgoProxy<Opr, arity>::exec(opr, tensors, W.workspace());
     }
 };
 
-template <>
-struct OprWeightPreprocessProxy<ConvolutionForward>
-        : public OprProxyProfilingTernary<ConvolutionForward> {
-    using OprProxyProfilingTernary<
-            ConvolutionForward>::OprProxyProfilingTernary;
-    void exec(ConvolutionForward* opr, const TensorNDArray& tensors) {
-        megdnn_assert(tensors.size() == 3);
-        if (!Base::W.valid()) {
-            Base::W = WorkspaceWrapper(opr->handle(), 0);
-        }
-        if (Base::m_profiling && !Base::target_algo_info.desc.valid()) {
-            size_t min_time = std::numeric_limits<size_t>::max();
-            for (auto algo : opr->get_all_algorithms_info(tensors[0].layout,
-                                                          tensors[1].layout,
-                                                          tensors[2].layout)) {
-                opr->execution_policy().algo = algo;
-
-                auto preprocess_tensors =
-                        weight_prerocess(opr, tensors, algo.desc);
-                megcoreSynchronize(opr->handle()->megcore_computing_handle());
-                ConvolutionForward::PreprocessedFilter preprocessed_filter{
-                        nullptr, *preprocess_tensors};
+#define DEF_PROF(c, arity)                                            \
+    template <>                                                       \
+    struct OprProxy<c> : public OprProxyProfilingBase<c, arity> {     \
+        using OprProxyProfilingBase<c, arity>::OprProxyProfilingBase; \
+    }
 
-                auto workspace_size = opr->get_workspace_in_bytes(
-                        tensors[0].layout, tensors[1].layout, tensors[2].layout,
-                        &preprocessed_filter);
-                Base::W.update(workspace_size);
+DEF_PROF(ConvolutionForward, 3);
+DEF_PROF(ConvolutionBackwardData, 3);
+DEF_PROF(ConvolutionBackwardFilter, 3);
+DEF_PROF(LocalShareForward, 3);
+DEF_PROF(LocalShareBackwardData, 3);
+DEF_PROF(LocalShareBackwardFilter, 3);
 
-                for (size_t times = 0; times < Base::warmup_times; ++times)
-                    opr->exec(tensors[0], tensors[1], tensors[2],
-                              &preprocessed_filter, Base::W.workspace());
-                megcoreSynchronize(opr->handle()->megcore_computing_handle());
-                Timer timer;
-                timer.start();
-                for (size_t times = 0; times < Base::exec_times; ++times) {
-                    opr->exec(tensors[0], tensors[1], tensors[2],
-                              &preprocessed_filter, Base::W.workspace());
-                }
-                megcoreSynchronize(opr->handle()->megcore_computing_handle());
-                timer.stop();
-                printf("%.3fms %s\n", timer.get_time_in_us() / 1e3,
-                       algo.name.c_str());
-                if (min_time > timer.get_time_in_us()) {
-                    min_time = timer.get_time_in_us();
-                    Base::target_algo_info = algo;
-                }
-            }
-            opr->execution_policy().algo = Base::target_algo_info;
-            auto preprocess_tensors =
-                    weight_prerocess(opr, tensors, Base::target_algo_info.desc);
-            megcoreSynchronize(opr->handle()->megcore_computing_handle());
-            ConvolutionForward::PreprocessedFilter preprocessed_filter{
-                    nullptr, *preprocess_tensors};
-            auto workspace_size = opr->get_workspace_in_bytes(
-                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
-                    &preprocessed_filter);
-            Base::W.update(workspace_size);
-        }
-        auto preprocess_tensors =
-                weight_prerocess(opr, tensors, Base::target_algo_info.desc);
-        megcoreSynchronize(opr->handle()->megcore_computing_handle());
-        ConvolutionForward::PreprocessedFilter preprocessed_filter{
-                nullptr, *preprocess_tensors};
-        if (!Base::target_algo_info.valid()) {
-            auto workspace_size = opr->get_workspace_in_bytes(
-                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
-                    &preprocessed_filter);
-            Base::W.update(workspace_size);
-        }
-        opr->exec(tensors[0], tensors[1], tensors[2], &preprocessed_filter,
-                  Base::W.workspace());
-    }
+DEF_PROF(DeformableConvForward, 5);
+DEF_PROF(DeformableConvBackwardFilter, 5);
+DEF_PROF(BatchConvBiasForward, 5);
+DEF_PROF(ConvBiasForward, 5);
 
-    //! handle weight preprocess
-    std::shared_ptr<TensorNDArray> weight_prerocess(
-            ConvolutionForward* opr, const TensorNDArray& tensors,
-            const ConvolutionForward::AlgorithmDesc&) {
-        auto weight_perprocess_layouts = opr->deduce_preprocessed_filter_layout(
-                tensors[0].layout, tensors[1].layout, tensors[2].layout);
-        auto preprocessed_filter_tensors_ptr =
-                alloc_tensors(opr->handle(), weight_perprocess_layouts);
-        ConvolutionForward::PreprocessedFilter preprocessed_filter{
-                nullptr, *preprocessed_filter_tensors_ptr};
-        size_t preprocess_workspace_size =
-                opr->get_preprocess_workspace_in_bytes(tensors[0].layout,
-                                                       tensors[1].layout,
-                                                       tensors[2].layout);
-        WorkspaceWrapper preprocess_workspace(opr->handle(),
-                                              preprocess_workspace_size);
-        opr->exec_preprocess(tensors[0].layout, tensors[1], tensors[2].layout,
-                             &preprocessed_filter,
-                             preprocess_workspace.workspace());
-        return preprocessed_filter_tensors_ptr;
-    }
-};
+DEF_PROF(DeformableConvBackwardData, 8);
+#undef DEF_PROF
 
-template <class Opr>
-struct OprProxyProfiling5 : public OprProxyProfilingBase<Opr, 5> {
-    using Base = OprProxyProfilingBase<Opr, 5>;
-    using OprProxyProfilingBase<Opr, 5>::OprProxyProfilingBase;
+template <class Opr, int arity>
+struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr, arity> {
+    using Base = OprProxyProfilingBase<Opr, arity>;
     void exec(Opr* opr, const TensorNDArray& tensors) {
-        megdnn_assert(tensors.size() == 5);
+        megdnn_assert(tensors.size() == arity);
         if (!Base::W.valid()) {
             Base::W = WorkspaceWrapper(opr->handle(), 0);
         }
-        if (Base::m_profiling && !Base::target_algo_info.valid()) {
-            size_t min_time = std::numeric_limits<size_t>::max();
-            for (auto algo : opr->get_all_algorithms_info(
-                         tensors[0].layout, tensors[1].layout,
-                         tensors[2].layout, tensors[3].layout,
-                         tensors[4].layout)) {
-                opr->execution_policy().algo = algo;
-                auto workspace_size = opr->get_workspace_in_bytes(
-                        tensors[0].layout, tensors[1].layout, tensors[2].layout,
-                        tensors[3].layout, tensors[4].layout);
-                Base::W.update(workspace_size);
-
-                for (size_t times = 0; times < Base::warmup_times; ++times)
-                    opr->exec(tensors[0], tensors[1], tensors[2], tensors[3],
-                              tensors[4], Base::W.workspace());
-                megcoreSynchronize(opr->handle()->megcore_computing_handle());
-                Timer timer;
-                timer.start();
-                for (size_t times = 0; times < Base::exec_times; ++times) {
-                    opr->exec(tensors[0], tensors[1], tensors[2], tensors[3],
-                              tensors[4], Base::W.workspace());
-                }
-                megcoreSynchronize(opr->handle()->megcore_computing_handle());
-                timer.stop();
-                printf("%.3fms %s\n", timer.get_time_in_us() / 1e3,
-                       algo.name.c_str());
-                if (min_time > timer.get_time_in_us()) {
-                    min_time = timer.get_time_in_us();
-                    Base::target_algo_info = algo;
-                }
-            }
-            opr->execution_policy().algo = Base::target_algo_info;
-            auto workspace_size = opr->get_workspace_in_bytes(
-                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
-                    tensors[3].layout, tensors[4].layout);
-            Base::W.update(workspace_size);
-        }
-        if (!Base::target_algo_info.valid()) {
-            auto workspace_size = opr->get_workspace_in_bytes(
-                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
-                    tensors[3].layout, tensors[4].layout);
-            Base::W.update(workspace_size);
-        }
-        opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], tensors[4],
-                  Base::W.workspace());
-    }
-};
-
-#define DEF_PROF5(c)                                     \
-    template <>                                          \
-    struct OprProxy<c> : public OprProxyProfiling5<c> {  \
-        using OprProxyProfiling5<c>::OprProxyProfiling5; \
-    }
-
-DEF_PROF5(DeformableConvForward);
-DEF_PROF5(DeformableConvBackwardFilter);
-DEF_PROF5(BatchConvBiasForward);
-#undef DEF_PROF5
 
-template <>
-struct OprProxy<ConvBiasForward> : public OprProxyProfiling5<ConvBiasForward> {
-    using OprProxyProfiling5<ConvBiasForward>::OprProxyProfiling5;
-    void exec(ConvBiasForward* opr, const TensorNDArray& tensors) {
-        megdnn_assert(tensors.size() == 5);
-        if (!Base::W.valid()) {
-            Base::W = WorkspaceWrapper(opr->handle(), 0);
+        TensorLayoutArray layouts;
+        for (auto&& tensor : tensors) {
+            layouts.push_back(tensor.layout);
         }
         if (Base::m_profiling && !Base::target_algo_info.desc.valid()) {
             size_t min_time = std::numeric_limits<size_t>::max();
-            for (auto algo : opr->get_all_algorithms_info(
-                         tensors[0].layout, tensors[1].layout,
-                         tensors[2].layout, tensors[3].layout,
-                         tensors[4].layout)) {
-                opr->execution_policy().algo = algo;
-                auto workspace_size = opr->get_workspace_in_bytes(
-                        tensors[0].layout, tensors[1].layout, tensors[2].layout,
-                        tensors[3].layout, tensors[4].layout, nullptr);
-                Base::W.update(workspace_size);
-
-                for (size_t times = 0; times < Base::warmup_times; ++times)
-                    opr->exec(tensors[0], tensors[1], tensors[2], tensors[3],
-                              tensors[4], nullptr, Base::W.workspace());
-                megcoreSynchronize(opr->handle()->megcore_computing_handle());
-                Timer timer;
-                timer.start();
-                for (size_t times = 0; times < Base::exec_times; ++times) {
-                    opr->exec(tensors[0], tensors[1], tensors[2], tensors[3],
-                              tensors[4], nullptr, Base::W.workspace());
-                }
-                megcoreSynchronize(opr->handle()->megcore_computing_handle());
-                timer.stop();
-                printf("%.3fms %s\n", timer.get_time_in_us() / 1e3,
-                       algo.name.c_str());
-                if (min_time > timer.get_time_in_us()) {
-                    min_time = timer.get_time_in_us();
-                    Base::target_algo_info = algo;
-                }
-            }
-            opr->execution_policy().algo = Base::target_algo_info;
-            auto workspace_size = opr->get_workspace_in_bytes(
-                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
-                    tensors[3].layout, tensors[4].layout, nullptr);
-            Base::W.update(workspace_size);
-        }
-        if (!Base::target_algo_info.valid()) {
-            auto workspace_size = opr->get_workspace_in_bytes(
-                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
-                    tensors[3].layout, tensors[4].layout, nullptr);
-            Base::W.update(workspace_size);
-        }
-        opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], tensors[4],
-                  nullptr, Base::W.workspace());
-    }
-};
-
-template <>
-struct OprWeightPreprocessProxy<ConvBiasForward>
-        : public OprProxyProfiling5<ConvBiasForward> {
-    using OprProxyProfiling5<ConvBiasForward>::OprProxyProfiling5;
-    void exec(ConvBiasForward* opr, const TensorNDArray& tensors) {
-        megdnn_assert(tensors.size() == 5);
-        if (!Base::W.valid()) {
-            Base::W = WorkspaceWrapper(opr->handle(), 0);
-        }
-        if (Base::m_profiling && !Base::target_algo_info.valid()) {
-            size_t min_time = std::numeric_limits<size_t>::max();
-            for (auto algo : opr->get_all_algorithms_info(
-                         tensors[0].layout, tensors[1].layout,
-                         tensors[2].layout, tensors[3].layout,
-                         tensors[4].layout)) {
+            for (auto algo :
+                 AlgoProxy<Opr, arity>::get_all_algorithms_info(opr, layouts)) {
                 opr->execution_policy().algo = algo;
 
                 auto preprocess_tensors =
                         weight_prerocess(opr, tensors, algo.desc);
                 megcoreSynchronize(opr->handle()->megcore_computing_handle());
-                ConvBiasForward::PreprocessedFilter preprocessed_filter{
+                typename Opr::PreprocessedFilter preprocessed_filter{
                         nullptr, *preprocess_tensors};
 
-                auto workspace_size = opr->get_workspace_in_bytes(
-                        tensors[0].layout, tensors[1].layout, tensors[2].layout,
-                        tensors[3].layout, tensors[4].layout,
-                        &preprocessed_filter);
+                auto workspace_size =
+                        AlgoProxy<Opr, arity>::get_workspace_in_bytes(
+                                opr, layouts, &preprocessed_filter);
                 Base::W.update(workspace_size);
 
-                for (size_t times = 0; times < Base::warmup_times; ++times)
-                    opr->exec(tensors[0], tensors[1], tensors[2], tensors[3],
-                              tensors[4], &preprocessed_filter,
-                              Base::W.workspace());
+                for (size_t times = 0; times < Base::warmup_times; ++times) {
+                    AlgoProxy<Opr, arity>::exec(opr, tensors,
+                                                &preprocessed_filter,
+                                                Base::W.workspace());
+                }
                 megcoreSynchronize(opr->handle()->megcore_computing_handle());
                 Timer timer;
                 timer.start();
                 for (size_t times = 0; times < Base::exec_times; ++times) {
-                    opr->exec(tensors[0], tensors[1], tensors[2], tensors[3],
-                              tensors[4], &preprocessed_filter,
-                              Base::W.workspace());
+                    AlgoProxy<Opr, arity>::exec(opr, tensors,
+                                                &preprocessed_filter,
+                                                Base::W.workspace());
                 }
                 megcoreSynchronize(opr->handle()->megcore_computing_handle());
                 timer.stop();
@@ -565,125 +295,65 @@ struct OprWeightPreprocessProxy<ConvBiasForward>
             auto preprocess_tensors =
                     weight_prerocess(opr, tensors, Base::target_algo_info.desc);
             megcoreSynchronize(opr->handle()->megcore_computing_handle());
-            ConvBiasForward::PreprocessedFilter preprocessed_filter{
+            typename Opr::PreprocessedFilter preprocessed_filter{
                     nullptr, *preprocess_tensors};
-            auto workspace_size = opr->get_workspace_in_bytes(
-                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
-                    tensors[3].layout, tensors[4].layout, &preprocessed_filter);
+            auto workspace_size = AlgoProxy<Opr, arity>::get_workspace_in_bytes(
+                    opr, layouts, &preprocessed_filter);
             Base::W.update(workspace_size);
         }
         auto preprocess_tensors =
                 weight_prerocess(opr, tensors, Base::target_algo_info.desc);
         megcoreSynchronize(opr->handle()->megcore_computing_handle());
-        ConvBiasForward::PreprocessedFilter preprocessed_filter{
+        typename Opr::PreprocessedFilter preprocessed_filter{
                 nullptr, *preprocess_tensors};
         if (!Base::target_algo_info.valid()) {
-            auto workspace_size = opr->get_workspace_in_bytes(
-                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
-                    tensors[3].layout, tensors[4].layout, &preprocessed_filter);
+            auto workspace_size = AlgoProxy<Opr, arity>::get_workspace_in_bytes(
+                    opr, layouts, &preprocessed_filter);
             Base::W.update(workspace_size);
         }
-        opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], tensors[4],
-                  &preprocessed_filter, Base::W.workspace());
+        AlgoProxy<Opr, arity>::exec(opr, tensors, &preprocessed_filter,
+                                    Base::W.workspace());
     }
 
     //! handle weight preprocess
     std::shared_ptr<TensorNDArray> weight_prerocess(
-            ConvBiasForward* opr, const TensorNDArray& tensors,
-            const ConvBiasForward::AlgorithmDesc&) {
-        auto weight_perprocess_layouts = opr->deduce_preprocessed_filter_layout(
-                tensors[0].layout, tensors[1].layout, tensors[2].layout,
-                tensors[3].layout, tensors[4].layout);
+            Opr* opr, const TensorNDArray& tensors,
+            const typename Opr::AlgorithmDesc&) {
+        TensorLayoutArray layouts;
+        for (auto&& tensor : tensors) {
+            layouts.push_back(tensor.layout);
+        }
+        auto weight_perprocess_layouts =
+                AlgoProxy<Opr, arity>::deduce_preprocessed_filter_layout(
+                        opr, layouts);
         auto preprocessed_filter_tensors_ptr =
-                alloc_tensors(opr->handle(), weight_perprocess_layouts);
-        ConvBiasForward::PreprocessedFilter preprocessed_filter{
+                Base::alloc_tensors(opr->handle(), weight_perprocess_layouts);
+        typename Opr::PreprocessedFilter preprocessed_filter{
                 nullptr, *preprocessed_filter_tensors_ptr};
         size_t preprocess_workspace_size =
-                opr->get_preprocess_workspace_in_bytes(
-                        tensors[0].layout, tensors[1].layout, tensors[2].layout,
-                        tensors[3].layout, tensors[4].layout);
+                AlgoProxy<Opr, arity>::get_preprocess_workspace_in_bytes(
+                        opr, layouts);
         WorkspaceWrapper preprocess_workspace(opr->handle(),
                                               preprocess_workspace_size);
-        opr->exec_preprocess(tensors[0].layout, tensors[1], tensors[2],
-                             tensors[3].layout, tensors[4].layout,
-                             &preprocessed_filter,
-                             preprocess_workspace.workspace());
+        AlgoProxy<Opr, arity>::exec_preprocess(
+                opr, tensors, layouts, &preprocessed_filter,
+                preprocess_workspace.workspace());
         return preprocessed_filter_tensors_ptr;
     }
 };
 
-template <class Opr>
-struct OprProxyProfiling8 : public OprProxyProfilingBase<Opr, 8> {
-    using Base = OprProxyProfilingBase<Opr, 8>;
-    using OprProxyProfilingBase<Opr, 8>::OprProxyProfilingBase;
-    void exec(Opr* opr, const TensorNDArray& tensors) {
-        megdnn_assert(tensors.size() == 8);
-        if (!Base::W.valid()) {
-            Base::W = WorkspaceWrapper(opr->handle(), 0);
-        }
-        if (Base::m_profiling && !Base::target_algo_info.valid()) {
-            size_t min_time = std::numeric_limits<size_t>::max();
-            for (auto algo : opr->get_all_algorithms_info(
-                         tensors[0].layout, tensors[1].layout,
-                         tensors[2].layout, tensors[3].layout,
-                         tensors[4].layout, tensors[5].layout,
-                         tensors[6].layout, tensors[7].layout)) {
-                opr->execution_policy().algo = algo;
-                auto workspace_size = opr->get_workspace_in_bytes(
-                        tensors[0].layout, tensors[1].layout, tensors[2].layout,
-                        tensors[3].layout, tensors[4].layout, tensors[5].layout,
-                        tensors[6].layout, tensors[7].layout);
-                Base::W.update(workspace_size);
-
-                for (size_t times = 0; times < Base::warmup_times; ++times)
-                    opr->exec(tensors[0], tensors[1], tensors[2], tensors[3],
-                              tensors[4], tensors[5], tensors[6], tensors[7],
-                              Base::W.workspace());
-                megcoreSynchronize(opr->handle()->megcore_computing_handle());
-                Timer timer;
-                timer.start();
-                for (size_t times = 0; times < Base::exec_times; ++times) {
-                    opr->exec(tensors[0], tensors[1], tensors[2], tensors[3],
-                              tensors[4], tensors[5], tensors[6], tensors[7],
-                              Base::W.workspace());
-                }
-                megcoreSynchronize(opr->handle()->megcore_computing_handle());
-                timer.stop();
-                printf("%.3fms %s\n", timer.get_time_in_us() / 1e3,
-                       algo.name.c_str());
-                if (min_time > timer.get_time_in_us()) {
-                    min_time = timer.get_time_in_us();
-                    Base::target_algo_info = algo;
-                }
-            }
-            opr->execution_policy().algo = Base::target_algo_info;
-            auto workspace_size = opr->get_workspace_in_bytes(
-                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
-                    tensors[3].layout, tensors[4].layout, tensors[5].layout,
-                    tensors[6].layout, tensors[7].layout);
-            Base::W.update(workspace_size);
-        }
-        if (!Base::target_algo_info.valid()) {
-            auto workspace_size = opr->get_workspace_in_bytes(
-                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
-                    tensors[3].layout, tensors[4].layout, tensors[5].layout,
-                    tensors[6].layout, tensors[7].layout);
-            Base::W.update(workspace_size);
-        }
-        opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], tensors[4],
-                  tensors[5], tensors[6], tensors[7], Base::W.workspace());
-    }
-};
-
-#define DEF_PROF8(c)                                     \
-    template <>                                          \
-    struct OprProxy<c> : public OprProxyProfiling8<c> {  \
-        using OprProxyProfiling8<c>::OprProxyProfiling8; \
+#define DEF_PROF(c, arity)                                    \
+    template <>                                               \
+    struct OprWeightPreprocessProxy<c>                        \
+            : public OprWeightPreprocessProxyImpl<c, arity> { \
+        using OprWeightPreprocessProxyImpl<                   \
+                c, arity>::OprWeightPreprocessProxyImpl;      \
     }
 
-DEF_PROF8(DeformableConvBackwardData);
+DEF_PROF(ConvolutionForward, 3);
+DEF_PROF(ConvBias, 5);
+#undef DEF_PROF
 
-#undef DEF_PROF8
 }  // namespace test
 }  // namespace megdnn