@@ -14,6 +14,7 @@
#include "test/common/deduce_layout_proxy.h"
#include "test/common/exec_proxy.h"
#include "test/common/inspect_type.h"
#include "test/common/opr_algo_proxy.h"
#include "test/common/opr_trait.h"
#include "test/common/timer.h"
#include "test/common/workspace_wrapper.h"
@@ -166,104 +167,33 @@ struct OprProxyProfilingBase
        }
        return ret;
    }
};

template <class Opr>
struct OprProxyProfilingTernary : public OprProxyProfilingBase<Opr, 3> {
    using Base = OprProxyProfilingBase<Opr, 3>;
    using OprProxyProfilingBase<Opr, 3>::OprProxyProfilingBase;
    void exec(Opr* opr, const TensorNDArray& tensors) {
        megdnn_assert(tensors.size() == 3);
        if (!Base::W.valid()) {
            Base::W = WorkspaceWrapper(opr->handle(), 0);
        megdnn_assert(tensors.size() == arity);
        if (!W.valid()) {
            W = WorkspaceWrapper(opr->handle(), 0);
        }
        if (Base::m_profiling && !Base::target_algo_info.valid()) {
            size_t min_time = std::numeric_limits<size_t>::max();
            for (auto algo : opr->get_all_algorithms_info(tensors[0].layout,
                                                          tensors[1].layout,
                                                          tensors[2].layout)) {
                opr->execution_policy().algo = algo;
                auto workspace_size = opr->get_workspace_in_bytes(
                        tensors[0].layout, tensors[1].layout,
                        tensors[2].layout);
                Base::W.update(workspace_size);

                for (size_t times = 0; times < Base::warmup_times; ++times)
                    opr->exec(tensors[0], tensors[1], tensors[2],
                              Base::W.workspace());
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                Timer timer;
                timer.start();
                for (size_t times = 0; times < Base::exec_times; ++times) {
                    opr->exec(tensors[0], tensors[1], tensors[2],
                              Base::W.workspace());
                }
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                timer.stop();
                printf("%.3fms %s\n", timer.get_time_in_us() / 1e3,
                       algo.name.c_str());
                if (min_time > timer.get_time_in_us()) {
                    min_time = timer.get_time_in_us();
                    Base::target_algo_info = algo;
                }
            }
            opr->execution_policy().algo = Base::target_algo_info;
            auto workspace_size = opr->get_workspace_in_bytes(
                    tensors[0].layout, tensors[1].layout, tensors[2].layout);
            Base::W.update(workspace_size);
        TensorLayoutArray layouts;
        for (auto&& tensor : tensors) {
            layouts.push_back(tensor.layout);
        }
        if (!Base::target_algo_info.valid()) {
            auto workspace_size = opr->get_workspace_in_bytes(
                    tensors[0].layout, tensors[1].layout, tensors[2].layout);
            Base::W.update(workspace_size);
        }
        opr->exec(tensors[0], tensors[1], tensors[2], Base::W.workspace());
    }
};
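
// The selection loop above follows the usual profiling recipe: for each
// candidate algorithm, warm up, synchronize, time `exec_times` runs, and keep
// the fastest one in `target_algo_info`. A minimal sketch of the same idea,
// assuming hypothetical `algorithms`, `run_once(algo)` and `sync()` helpers
// that are not part of this file:
//
//     size_t best_us = std::numeric_limits<size_t>::max();
//     for (auto&& algo : algorithms) {
//         for (size_t i = 0; i < warmup_times; ++i) run_once(algo);  // warm up
//         sync();                           // drain the async compute queue
//         Timer timer;
//         timer.start();
//         for (size_t i = 0; i < exec_times; ++i) run_once(algo);    // measure
//         sync();
//         timer.stop();
//         if (timer.get_time_in_us() < best_us) {
//             best_us = timer.get_time_in_us();
//             best_algo = algo;             // keep the fastest candidate
//         }
//     }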

#define DEF_PROF3(c) \
    template <> \
    struct OprProxy<c> : public OprProxyProfilingTernary<c> { \
        using OprProxyProfilingTernary<c>::OprProxyProfilingTernary; \
    }

DEF_PROF3(ConvolutionBackwardData);
DEF_PROF3(ConvolutionBackwardFilter);
DEF_PROF3(LocalShareForward);
DEF_PROF3(LocalShareBackwardData);
DEF_PROF3(LocalShareBackwardFilter);
#undef DEF_PROF3
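
// For reference, `DEF_PROF3(ConvolutionBackwardData);` above expands to
// roughly the following explicit specialization (the trailing semicolon is
// supplied by the invocation site):
//
//     template <>
//     struct OprProxy<ConvolutionBackwardData>
//             : public OprProxyProfilingTernary<ConvolutionBackwardData> {
//         using OprProxyProfilingTernary<
//                 ConvolutionBackwardData>::OprProxyProfilingTernary;
//     };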

template <>
struct OprProxy<ConvolutionForward>
        : public OprProxyProfilingTernary<ConvolutionForward> {
    using OprProxyProfilingTernary<
            ConvolutionForward>::OprProxyProfilingTernary;
    void exec(ConvolutionForward* opr, const TensorNDArray& tensors) {
        megdnn_assert(tensors.size() == 3);
        if (!Base::W.valid()) {
            Base::W = WorkspaceWrapper(opr->handle(), 0);
        }
        if (Base::m_profiling && !Base::target_algo_info.desc.valid()) {
        if (m_profiling && !target_algo_info.valid()) {
            size_t min_time = std::numeric_limits<size_t>::max();
            for (auto algo : opr->get_all_algorithms_info(tensors[0].layout,
                                                          tensors[1].layout,
                                                          tensors[2].layout)) {
            for (auto algo :
                 AlgoProxy<Opr, arity>::get_all_algorithms_info(opr, layouts)) {
                opr->execution_policy().algo = algo;
                auto workspace_size = opr->get_workspace_in_bytes(
                        tensors[0].layout, tensors[1].layout, tensors[2].layout,
                        nullptr);
                Base::W.update(workspace_size);
                auto workspace_size =
                        AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr,
                                                                      layouts);
                W.update(workspace_size);

                for (size_t times = 0; times < Base::warmup_times; ++times)
                    opr->exec(tensors[0], tensors[1], tensors[2], nullptr,
                              Base::W.workspace());
                for (size_t times = 0; times < warmup_times; ++times)
                    AlgoProxy<Opr, arity>::exec(opr, tensors, W.workspace());
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                Timer timer;
                timer.start();
                for (size_t times = 0; times < Base::exec_times; ++times) {
                    opr->exec(tensors[0], tensors[1], tensors[2], nullptr,
                              Base::W.workspace());
                for (size_t times = 0; times < exec_times; ++times) {
                    AlgoProxy<Opr, arity>::exec(opr, tensors, W.workspace());
                }
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                timer.stop();
@@ -271,286 +201,86 @@ struct OprProxy<ConvolutionForward>
                       algo.name.c_str());
                if (min_time > timer.get_time_in_us()) {
                    min_time = timer.get_time_in_us();
                    Base::target_algo_info = algo;
                    target_algo_info = algo;
                }
            }
            opr->execution_policy().algo = Base::target_algo_info;
            auto workspace_size = opr->get_workspace_in_bytes(
                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
                    nullptr);
            Base::W.update(workspace_size);
            opr->execution_policy().algo = target_algo_info;
            auto workspace_size =
                    AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr, layouts);
            W.update(workspace_size);
        }
        if (!Base::target_algo_info.desc.valid()) {
            auto workspace_size = opr->get_workspace_in_bytes(
                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
                    nullptr);
            Base::W.update(workspace_size);
        if (!target_algo_info.valid()) {
            auto workspace_size =
                    AlgoProxy<Opr, arity>::get_workspace_in_bytes(opr, layouts);
            W.update(workspace_size);
        }
        opr->exec(tensors[0], tensors[1], tensors[2], nullptr,
                  Base::W.workspace());
        AlgoProxy<Opr, arity>::exec(opr, tensors, W.workspace());
    }
};

template <>
struct OprWeightPreprocessProxy<ConvolutionForward>
        : public OprProxyProfilingTernary<ConvolutionForward> {
    using OprProxyProfilingTernary<
            ConvolutionForward>::OprProxyProfilingTernary;
    void exec(ConvolutionForward* opr, const TensorNDArray& tensors) {
        megdnn_assert(tensors.size() == 3);
        if (!Base::W.valid()) {
            Base::W = WorkspaceWrapper(opr->handle(), 0);
        }
        if (Base::m_profiling && !Base::target_algo_info.desc.valid()) {
            size_t min_time = std::numeric_limits<size_t>::max();
            for (auto algo : opr->get_all_algorithms_info(tensors[0].layout,
                                                          tensors[1].layout,
                                                          tensors[2].layout)) {
                opr->execution_policy().algo = algo;

                auto preprocess_tensors =
                        weight_prerocess(opr, tensors, algo.desc);
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                ConvolutionForward::PreprocessedFilter preprocessed_filter{
                        nullptr, *preprocess_tensors};
#define DEF_PROF(c, arity) \
    template <> \
    struct OprProxy<c> : public OprProxyProfilingBase<c, arity> { \
        using OprProxyProfilingBase<c, arity>::OprProxyProfilingBase; \
    }

                auto workspace_size = opr->get_workspace_in_bytes(
                        tensors[0].layout, tensors[1].layout, tensors[2].layout,
                        &preprocessed_filter);
                Base::W.update(workspace_size);
DEF_PROF(ConvolutionForward, 3);
DEF_PROF(ConvolutionBackwardData, 3);
DEF_PROF(ConvolutionBackwardFilter, 3);
DEF_PROF(LocalShareForward, 3);
DEF_PROF(LocalShareBackwardData, 3);
DEF_PROF(LocalShareBackwardFilter, 3);

                for (size_t times = 0; times < Base::warmup_times; ++times)
                    opr->exec(tensors[0], tensors[1], tensors[2],
                              &preprocessed_filter, Base::W.workspace());
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                Timer timer;
                timer.start();
                for (size_t times = 0; times < Base::exec_times; ++times) {
                    opr->exec(tensors[0], tensors[1], tensors[2],
                              &preprocessed_filter, Base::W.workspace());
                }
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                timer.stop();
                printf("%.3fms %s\n", timer.get_time_in_us() / 1e3,
                       algo.name.c_str());
                if (min_time > timer.get_time_in_us()) {
                    min_time = timer.get_time_in_us();
                    Base::target_algo_info = algo;
                }
            }
            opr->execution_policy().algo = Base::target_algo_info;
            auto preprocess_tensors =
                    weight_prerocess(opr, tensors, Base::target_algo_info.desc);
            megcoreSynchronize(opr->handle()->megcore_computing_handle());
            ConvolutionForward::PreprocessedFilter preprocessed_filter{
                    nullptr, *preprocess_tensors};
            auto workspace_size = opr->get_workspace_in_bytes(
                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
                    &preprocessed_filter);
            Base::W.update(workspace_size);
        }
        auto preprocess_tensors =
                weight_prerocess(opr, tensors, Base::target_algo_info.desc);
        megcoreSynchronize(opr->handle()->megcore_computing_handle());
        ConvolutionForward::PreprocessedFilter preprocessed_filter{
                nullptr, *preprocess_tensors};
        if (!Base::target_algo_info.valid()) {
            auto workspace_size = opr->get_workspace_in_bytes(
                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
                    &preprocessed_filter);
            Base::W.update(workspace_size);
        }
        opr->exec(tensors[0], tensors[1], tensors[2], &preprocessed_filter,
                  Base::W.workspace());
    }
DEF_PROF(DeformableConvForward, 5);
DEF_PROF(DeformableConvBackwardFilter, 5);
DEF_PROF(BatchConvBiasForward, 5);
DEF_PROF(ConvBiasForward, 5);

    //! handle weight preprocess
    std::shared_ptr<TensorNDArray> weight_prerocess(
            ConvolutionForward* opr, const TensorNDArray& tensors,
            const ConvolutionForward::AlgorithmDesc&) {
        auto weight_perprocess_layouts = opr->deduce_preprocessed_filter_layout(
                tensors[0].layout, tensors[1].layout, tensors[2].layout);
        auto preprocessed_filter_tensors_ptr =
                alloc_tensors(opr->handle(), weight_perprocess_layouts);
        ConvolutionForward::PreprocessedFilter preprocessed_filter{
                nullptr, *preprocessed_filter_tensors_ptr};
        size_t preprocess_workspace_size =
                opr->get_preprocess_workspace_in_bytes(tensors[0].layout,
                                                       tensors[1].layout,
                                                       tensors[2].layout);
        WorkspaceWrapper preprocess_workspace(opr->handle(),
                                              preprocess_workspace_size);
        opr->exec_preprocess(tensors[0].layout, tensors[1], tensors[2].layout,
                             &preprocessed_filter,
                             preprocess_workspace.workspace());
        return preprocessed_filter_tensors_ptr;
    }
};
DEF_PROF(DeformableConvBackwardData, 8);
#undef DEF_PROF
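
// For reference, the arity-parameterized macro used above, e.g.
// `DEF_PROF(ConvBiasForward, 5);`, expands to roughly:
//
//     template <>
//     struct OprProxy<ConvBiasForward>
//             : public OprProxyProfilingBase<ConvBiasForward, 5> {
//         using OprProxyProfilingBase<ConvBiasForward,
//                                     5>::OprProxyProfilingBase;
//     };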

template <class Opr>
struct OprProxyProfiling5 : public OprProxyProfilingBase<Opr, 5> {
    using Base = OprProxyProfilingBase<Opr, 5>;
    using OprProxyProfilingBase<Opr, 5>::OprProxyProfilingBase;
template <class Opr, int arity>
struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase<Opr, arity> {
    using Base = OprProxyProfilingBase<Opr, arity>;
    void exec(Opr* opr, const TensorNDArray& tensors) {
        megdnn_assert(tensors.size() == 5);
        megdnn_assert(tensors.size() == arity);
        if (!Base::W.valid()) {
            Base::W = WorkspaceWrapper(opr->handle(), 0);
        }
        if (Base::m_profiling && !Base::target_algo_info.valid()) {
            size_t min_time = std::numeric_limits<size_t>::max();
            for (auto algo : opr->get_all_algorithms_info(
                         tensors[0].layout, tensors[1].layout,
                         tensors[2].layout, tensors[3].layout,
                         tensors[4].layout)) {
                opr->execution_policy().algo = algo;
                auto workspace_size = opr->get_workspace_in_bytes(
                        tensors[0].layout, tensors[1].layout, tensors[2].layout,
                        tensors[3].layout, tensors[4].layout);
                Base::W.update(workspace_size);

                for (size_t times = 0; times < Base::warmup_times; ++times)
                    opr->exec(tensors[0], tensors[1], tensors[2], tensors[3],
                              tensors[4], Base::W.workspace());
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                Timer timer;
                timer.start();
                for (size_t times = 0; times < Base::exec_times; ++times) {
                    opr->exec(tensors[0], tensors[1], tensors[2], tensors[3],
                              tensors[4], Base::W.workspace());
                }
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                timer.stop();
                printf("%.3fms %s\n", timer.get_time_in_us() / 1e3,
                       algo.name.c_str());
                if (min_time > timer.get_time_in_us()) {
                    min_time = timer.get_time_in_us();
                    Base::target_algo_info = algo;
                }
            }
            opr->execution_policy().algo = Base::target_algo_info;
            auto workspace_size = opr->get_workspace_in_bytes(
                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
                    tensors[3].layout, tensors[4].layout);
            Base::W.update(workspace_size);
        }
        if (!Base::target_algo_info.valid()) {
            auto workspace_size = opr->get_workspace_in_bytes(
                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
                    tensors[3].layout, tensors[4].layout);
            Base::W.update(workspace_size);
        }
        opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], tensors[4],
                  Base::W.workspace());
    }
};

#define DEF_PROF5(c) \
    template <> \
    struct OprProxy<c> : public OprProxyProfiling5<c> { \
        using OprProxyProfiling5<c>::OprProxyProfiling5; \
    }

DEF_PROF5(DeformableConvForward);
DEF_PROF5(DeformableConvBackwardFilter);
DEF_PROF5(BatchConvBiasForward);
#undef DEF_PROF5

template <>
struct OprProxy<ConvBiasForward> : public OprProxyProfiling5<ConvBiasForward> {
    using OprProxyProfiling5<ConvBiasForward>::OprProxyProfiling5;
    void exec(ConvBiasForward* opr, const TensorNDArray& tensors) {
        megdnn_assert(tensors.size() == 5);
        if (!Base::W.valid()) {
            Base::W = WorkspaceWrapper(opr->handle(), 0);
        TensorLayoutArray layouts;
        for (auto&& tensor : tensors) {
            layouts.push_back(tensor.layout);
        }
        if (Base::m_profiling && !Base::target_algo_info.desc.valid()) {
            size_t min_time = std::numeric_limits<size_t>::max();
            for (auto algo : opr->get_all_algorithms_info(
                         tensors[0].layout, tensors[1].layout,
                         tensors[2].layout, tensors[3].layout,
                         tensors[4].layout)) {
                opr->execution_policy().algo = algo;
                auto workspace_size = opr->get_workspace_in_bytes(
                        tensors[0].layout, tensors[1].layout, tensors[2].layout,
                        tensors[3].layout, tensors[4].layout, nullptr);
                Base::W.update(workspace_size);

                for (size_t times = 0; times < Base::warmup_times; ++times)
                    opr->exec(tensors[0], tensors[1], tensors[2], tensors[3],
                              tensors[4], nullptr, Base::W.workspace());
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                Timer timer;
                timer.start();
                for (size_t times = 0; times < Base::exec_times; ++times) {
                    opr->exec(tensors[0], tensors[1], tensors[2], tensors[3],
                              tensors[4], nullptr, Base::W.workspace());
                }
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                timer.stop();
                printf("%.3fms %s\n", timer.get_time_in_us() / 1e3,
                       algo.name.c_str());
                if (min_time > timer.get_time_in_us()) {
                    min_time = timer.get_time_in_us();
                    Base::target_algo_info = algo;
                }
            }
            opr->execution_policy().algo = Base::target_algo_info;
            auto workspace_size = opr->get_workspace_in_bytes(
                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
                    tensors[3].layout, tensors[4].layout, nullptr);
            Base::W.update(workspace_size);
        }
        if (!Base::target_algo_info.valid()) {
            auto workspace_size = opr->get_workspace_in_bytes(
                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
                    tensors[3].layout, tensors[4].layout, nullptr);
            Base::W.update(workspace_size);
        }
        opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], tensors[4],
                  nullptr, Base::W.workspace());
    }
};

template <>
struct OprWeightPreprocessProxy<ConvBiasForward>
        : public OprProxyProfiling5<ConvBiasForward> {
    using OprProxyProfiling5<ConvBiasForward>::OprProxyProfiling5;
    void exec(ConvBiasForward* opr, const TensorNDArray& tensors) {
        megdnn_assert(tensors.size() == 5);
        if (!Base::W.valid()) {
            Base::W = WorkspaceWrapper(opr->handle(), 0);
        }
        if (Base::m_profiling && !Base::target_algo_info.valid()) {
            size_t min_time = std::numeric_limits<size_t>::max();
            for (auto algo : opr->get_all_algorithms_info(
                         tensors[0].layout, tensors[1].layout,
                         tensors[2].layout, tensors[3].layout,
                         tensors[4].layout)) {
            for (auto algo :
                 AlgoProxy<Opr, arity>::get_all_algorithms_info(opr, layouts)) {
                opr->execution_policy().algo = algo;

                auto preprocess_tensors =
                        weight_prerocess(opr, tensors, algo.desc);
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                ConvBiasForward::PreprocessedFilter preprocessed_filter{
                typename Opr::PreprocessedFilter preprocessed_filter{
                        nullptr, *preprocess_tensors};

                auto workspace_size = opr->get_workspace_in_bytes(
                        tensors[0].layout, tensors[1].layout, tensors[2].layout,
                        tensors[3].layout, tensors[4].layout,
                        &preprocessed_filter);
                auto workspace_size =
                        AlgoProxy<Opr, arity>::get_workspace_in_bytes(
                                opr, layouts, &preprocessed_filter);
                Base::W.update(workspace_size);

                for (size_t times = 0; times < Base::warmup_times; ++times)
                    opr->exec(tensors[0], tensors[1], tensors[2], tensors[3],
                              tensors[4], &preprocessed_filter,
                              Base::W.workspace());
                for (size_t times = 0; times < Base::warmup_times; ++times) {
                    AlgoProxy<Opr, arity>::exec(opr, tensors,
                                                &preprocessed_filter,
                                                Base::W.workspace());
                }
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                Timer timer;
                timer.start();
                for (size_t times = 0; times < Base::exec_times; ++times) {
                    opr->exec(tensors[0], tensors[1], tensors[2], tensors[3],
                              tensors[4], &preprocessed_filter,
                              Base::W.workspace());
                    AlgoProxy<Opr, arity>::exec(opr, tensors,
                                                &preprocessed_filter,
                                                Base::W.workspace());
                }
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                timer.stop();
@@ -565,125 +295,65 @@ struct OprWeightPreprocessProxy<ConvBiasForward>
            auto preprocess_tensors =
                    weight_prerocess(opr, tensors, Base::target_algo_info.desc);
            megcoreSynchronize(opr->handle()->megcore_computing_handle());
            ConvBiasForward::PreprocessedFilter preprocessed_filter{
            typename Opr::PreprocessedFilter preprocessed_filter{
                    nullptr, *preprocess_tensors};
            auto workspace_size = opr->get_workspace_in_bytes(
                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
                    tensors[3].layout, tensors[4].layout, &preprocessed_filter);
            auto workspace_size = AlgoProxy<Opr, arity>::get_workspace_in_bytes(
                    opr, layouts, &preprocessed_filter);
            Base::W.update(workspace_size);
        }
        auto preprocess_tensors =
                weight_prerocess(opr, tensors, Base::target_algo_info.desc);
        megcoreSynchronize(opr->handle()->megcore_computing_handle());
        ConvBiasForward::PreprocessedFilter preprocessed_filter{
        typename Opr::PreprocessedFilter preprocessed_filter{
                nullptr, *preprocess_tensors};
        if (!Base::target_algo_info.valid()) {
            auto workspace_size = opr->get_workspace_in_bytes(
                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
                    tensors[3].layout, tensors[4].layout, &preprocessed_filter);
            auto workspace_size = AlgoProxy<Opr, arity>::get_workspace_in_bytes(
                    opr, layouts, &preprocessed_filter);
            Base::W.update(workspace_size);
        }
        opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], tensors[4],
                  &preprocessed_filter, Base::W.workspace());
        AlgoProxy<Opr, arity>::exec(opr, tensors, &preprocessed_filter,
                                    Base::W.workspace());
    }

    //! handle weight preprocess
    std::shared_ptr<TensorNDArray> weight_prerocess(
            ConvBiasForward* opr, const TensorNDArray& tensors,
            const ConvBiasForward::AlgorithmDesc&) {
        auto weight_perprocess_layouts = opr->deduce_preprocessed_filter_layout(
                tensors[0].layout, tensors[1].layout, tensors[2].layout,
                tensors[3].layout, tensors[4].layout);
            Opr* opr, const TensorNDArray& tensors,
            const typename Opr::AlgorithmDesc&) {
        TensorLayoutArray layouts;
        for (auto&& tensor : tensors) {
            layouts.push_back(tensor.layout);
        }
        auto weight_perprocess_layouts =
                AlgoProxy<Opr, arity>::deduce_preprocessed_filter_layout(
                        opr, layouts);
        auto preprocessed_filter_tensors_ptr =
                alloc_tensors(opr->handle(), weight_perprocess_layouts);
        ConvBiasForward::PreprocessedFilter preprocessed_filter{
                Base::alloc_tensors(opr->handle(), weight_perprocess_layouts);
        typename Opr::PreprocessedFilter preprocessed_filter{
                nullptr, *preprocessed_filter_tensors_ptr};
        size_t preprocess_workspace_size =
                opr->get_preprocess_workspace_in_bytes(
                        tensors[0].layout, tensors[1].layout, tensors[2].layout,
                        tensors[3].layout, tensors[4].layout);
                AlgoProxy<Opr, arity>::get_preprocess_workspace_in_bytes(
                        opr, layouts);
        WorkspaceWrapper preprocess_workspace(opr->handle(),
                                              preprocess_workspace_size);
        opr->exec_preprocess(tensors[0].layout, tensors[1], tensors[2],
                             tensors[3].layout, tensors[4].layout,
                             &preprocessed_filter,
                             preprocess_workspace.workspace());
        AlgoProxy<Opr, arity>::exec_preprocess(
                opr, tensors, layouts, &preprocessed_filter,
                preprocess_workspace.workspace());
        return preprocessed_filter_tensors_ptr;
    }
};
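
// Summary of the weight-preprocess path implemented above: deduce the
// preprocessed filter layouts, allocate tensors for them, query the
// preprocess workspace, run exec_preprocess() to fill those tensors, wrap
// them in a PreprocessedFilter, and pass that filter to both
// get_workspace_in_bytes() and exec() so the profiled run measures the
// preprocessed kernel.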

template <class Opr>
struct OprProxyProfiling8 : public OprProxyProfilingBase<Opr, 8> {
    using Base = OprProxyProfilingBase<Opr, 8>;
    using OprProxyProfilingBase<Opr, 8>::OprProxyProfilingBase;
    void exec(Opr* opr, const TensorNDArray& tensors) {
        megdnn_assert(tensors.size() == 8);
        if (!Base::W.valid()) {
            Base::W = WorkspaceWrapper(opr->handle(), 0);
        }
        if (Base::m_profiling && !Base::target_algo_info.valid()) {
            size_t min_time = std::numeric_limits<size_t>::max();
            for (auto algo : opr->get_all_algorithms_info(
                         tensors[0].layout, tensors[1].layout,
                         tensors[2].layout, tensors[3].layout,
                         tensors[4].layout, tensors[5].layout,
                         tensors[6].layout, tensors[7].layout)) {
                opr->execution_policy().algo = algo;
                auto workspace_size = opr->get_workspace_in_bytes(
                        tensors[0].layout, tensors[1].layout, tensors[2].layout,
                        tensors[3].layout, tensors[4].layout, tensors[5].layout,
                        tensors[6].layout, tensors[7].layout);
                Base::W.update(workspace_size);

                for (size_t times = 0; times < Base::warmup_times; ++times)
                    opr->exec(tensors[0], tensors[1], tensors[2], tensors[3],
                              tensors[4], tensors[5], tensors[6], tensors[7],
                              Base::W.workspace());
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                Timer timer;
                timer.start();
                for (size_t times = 0; times < Base::exec_times; ++times) {
                    opr->exec(tensors[0], tensors[1], tensors[2], tensors[3],
                              tensors[4], tensors[5], tensors[6], tensors[7],
                              Base::W.workspace());
                }
                megcoreSynchronize(opr->handle()->megcore_computing_handle());
                timer.stop();
                printf("%.3fms %s\n", timer.get_time_in_us() / 1e3,
                       algo.name.c_str());
                if (min_time > timer.get_time_in_us()) {
                    min_time = timer.get_time_in_us();
                    Base::target_algo_info = algo;
                }
            }
            opr->execution_policy().algo = Base::target_algo_info;
            auto workspace_size = opr->get_workspace_in_bytes(
                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
                    tensors[3].layout, tensors[4].layout, tensors[5].layout,
                    tensors[6].layout, tensors[7].layout);
            Base::W.update(workspace_size);
        }
        if (!Base::target_algo_info.valid()) {
            auto workspace_size = opr->get_workspace_in_bytes(
                    tensors[0].layout, tensors[1].layout, tensors[2].layout,
                    tensors[3].layout, tensors[4].layout, tensors[5].layout,
                    tensors[6].layout, tensors[7].layout);
            Base::W.update(workspace_size);
        }
        opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], tensors[4],
                  tensors[5], tensors[6], tensors[7], Base::W.workspace());
    }
};

#define DEF_PROF8(c) \
    template <> \
    struct OprProxy<c> : public OprProxyProfiling8<c> { \
        using OprProxyProfiling8<c>::OprProxyProfiling8; \
#define DEF_PROF(c, arity) \
    template <> \
    struct OprWeightPreprocessProxy<c> \
            : public OprWeightPreprocessProxyImpl<c, arity> { \
        using OprWeightPreprocessProxyImpl< \
                c, arity>::OprWeightPreprocessProxyImpl; \
    }

DEF_PROF8(DeformableConvBackwardData);
DEF_PROF(ConvolutionForward, 3);
DEF_PROF(ConvBias, 5);
#undef DEF_PROF
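
// For reference, `DEF_PROF(ConvBias, 5);` above specializes the
// weight-preprocess proxy and expands to roughly:
//
//     template <>
//     struct OprWeightPreprocessProxy<ConvBias>
//             : public OprWeightPreprocessProxyImpl<ConvBias, 5> {
//         using OprWeightPreprocessProxyImpl<
//                 ConvBias, 5>::OprWeightPreprocessProxyImpl;
//     };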

#undef DEF_PROF8
} // namespace test
} // namespace megdnn