diff --git a/dnn/test/common/opr_algo_proxy.h b/dnn/test/common/opr_algo_proxy.h index fa362756..b2a3d953 100644 --- a/dnn/test/common/opr_algo_proxy.h +++ b/dnn/test/common/opr_algo_proxy.h @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. */ #pragma once @@ -20,36 +21,126 @@ namespace test { template struct AlgoProxy; -template -struct AlgoProxy { - static std::vector get_all_algorithms_info( - Opr* opr, TensorLayoutArray& layouts) { - megdnn_assert(layouts.size() == 3); - return opr->get_all_algorithms_info(layouts[0], layouts[1], layouts[2]); - } - static typename Opr::AlgorithmInfo get_algorithm_info_heuristic( - Opr* opr, TensorLayoutArray& layouts) { - megdnn_assert(layouts.size() == 3); - return opr->get_algorithm_info_heuristic(layouts[0], layouts[1], - layouts[2]); +#define DEF_ALGO_PROXY(arity) \ + template \ + struct AlgoProxy { \ + static std::vector \ + get_all_algorithms_info(Opr* opr, const TensorLayoutArray& layouts) { \ + megdnn_assert(layouts.size() == arity); \ + return opr->get_all_algorithms_info(LAYOUTS); \ + } \ + static typename Opr::AlgorithmInfo get_algorithm_info_heuristic( \ + Opr* opr, const TensorLayoutArray& layouts) { \ + megdnn_assert(layouts.size() == arity); \ + return opr->get_algorithm_info_heuristic(LAYOUTS); \ + } \ + static size_t get_workspace_in_bytes( \ + Opr* opr, const TensorLayoutArray& layouts) { \ + megdnn_assert(layouts.size() == arity); \ + return opr->get_workspace_in_bytes(LAYOUTS); \ + } \ + static void exec(Opr* opr, const TensorNDArray& tensors, \ + Workspace workspace) { \ + megdnn_assert(tensors.size() == arity); \ + return opr->exec(TENSORS, workspace); \ + } \ } -}; -template -struct AlgoProxy { - static std::vector 
get_all_algorithms_info( - Opr* opr, TensorLayoutArray& layouts) { - megdnn_assert(layouts.size() == 5); - return opr->get_all_algorithms_info(layouts[0], layouts[1], layouts[2], - layouts[3], layouts[4]); - } - static typename Opr::AlgorithmInfo get_algorithm_info_heuristic( - Opr* opr, TensorLayoutArray& layouts) { - megdnn_assert(layouts.size() == 5); - return opr->get_algorithm_info_heuristic( - layouts[0], layouts[1], layouts[2], layouts[3], layouts[4]); - } -}; +#define LAYOUTS layouts[0], layouts[1], layouts[2] +#define TENSORS tensors[0], tensors[1], tensors[2] +DEF_ALGO_PROXY(3); +#undef LAYOUTS +#undef TENSORS + +#define LAYOUTS layouts[0], layouts[1], layouts[2], layouts[3], layouts[4] +#define TENSORS tensors[0], tensors[1], tensors[2], tensors[3], tensors[4] +DEF_ALGO_PROXY(5); +#undef LAYOUTS +#undef TENSORS + +#define LAYOUTS \ + layouts[0], layouts[1], layouts[2], layouts[3], layouts[4], layouts[5], \ + layouts[6], layouts[7] +#define TENSORS \ + tensors[0], tensors[1], tensors[2], tensors[3], tensors[4], tensors[5], \ + tensors[6], tensors[7] +DEF_ALGO_PROXY(8); +#undef LAYOUTS +#undef TENSORS + +#undef DEF_ALGO_PROXY + +#define DEF_ALGO_PROXY(Opr, arity) \ + template <> \ + struct AlgoProxy { \ + static std::vector \ + get_all_algorithms_info(Opr* opr, const TensorLayoutArray& layouts) { \ + megdnn_assert(layouts.size() == arity); \ + return opr->get_all_algorithms_info(LAYOUTS); \ + } \ + static typename Opr::AlgorithmInfo get_algorithm_info_heuristic( \ + Opr* opr, const TensorLayoutArray& layouts) { \ + megdnn_assert(layouts.size() == arity); \ + return opr->get_algorithm_info_heuristic(LAYOUTS); \ + } \ + static size_t get_workspace_in_bytes( \ + Opr* opr, const TensorLayoutArray& layouts, \ + const typename Opr::PreprocessedFilter* preprocessed_filter = \ + nullptr) { \ + megdnn_assert(layouts.size() == arity); \ + return opr->get_workspace_in_bytes(LAYOUTS, preprocessed_filter); \ + } \ + static void exec( \ + Opr* opr, const TensorNDArray& 
tensors, \ + const typename Opr::PreprocessedFilter* preprocessed_filter, \ + Workspace workspace) { \ + megdnn_assert(tensors.size() == arity); \ + return opr->exec(TENSORS, preprocessed_filter, workspace); \ + } \ + static void exec(Opr* opr, const TensorNDArray& tensors, \ + Workspace workspace) { \ + megdnn_assert(tensors.size() == arity); \ + return opr->exec(TENSORS, nullptr, workspace); \ + } \ + static size_t get_preprocess_workspace_in_bytes( \ + Opr* opr, const TensorLayoutArray& layouts) { \ + megdnn_assert(layouts.size() == arity); \ + return opr->get_preprocess_workspace_in_bytes(LAYOUTS); \ + } \ + static SmallVector deduce_preprocessed_filter_layout( \ + Opr* opr, const TensorLayoutArray& layouts) { \ + megdnn_assert(layouts.size() == arity); \ + return opr->deduce_preprocessed_filter_layout(LAYOUTS); \ + } \ + static void exec_preprocess( \ + Opr* opr, const TensorNDArray& tensors, \ + const TensorLayoutArray& layouts, \ + Opr::PreprocessedFilter* preprocessed_filter, \ + _megdnn_workspace workspace) { \ + megdnn_assert(layouts.size() == arity && tensors.size() == arity); \ + return opr->exec_preprocess(PREPROCESS_ARGS, preprocessed_filter, \ + workspace); \ + } \ + }; + +#define LAYOUTS layouts[0], layouts[1], layouts[2] +#define TENSORS tensors[0], tensors[1], tensors[2] +#define PREPROCESS_ARGS layouts[0], tensors[1], layouts[2] +DEF_ALGO_PROXY(ConvolutionForward, 3); +#undef PREPROCESS_ARGS +#undef LAYOUTS +#undef TENSORS + +#define LAYOUTS layouts[0], layouts[1], layouts[2], layouts[3], layouts[4] +#define TENSORS tensors[0], tensors[1], tensors[2], tensors[3], tensors[4] +#define PREPROCESS_ARGS \ + layouts[0], tensors[1], tensors[2], layouts[3], layouts[4] +DEF_ALGO_PROXY(ConvBias, 5); +#undef PREPROCESS_ARGS +#undef LAYOUTS +#undef TENSORS + +#undef DEF_ALGO_PROXY template ::arity> struct OprAlgoProxyDefaultImpl : public AlgoProxy {}; diff --git a/dnn/test/common/opr_proxy.h b/dnn/test/common/opr_proxy.h index 5d587415..a3ef862e 100644 --- 
a/dnn/test/common/opr_proxy.h +++ b/dnn/test/common/opr_proxy.h @@ -14,6 +14,7 @@ #include "test/common/deduce_layout_proxy.h" #include "test/common/exec_proxy.h" #include "test/common/inspect_type.h" +#include "test/common/opr_algo_proxy.h" #include "test/common/opr_trait.h" #include "test/common/timer.h" #include "test/common/workspace_wrapper.h" @@ -166,104 +167,33 @@ struct OprProxyProfilingBase } return ret; } -}; -template -struct OprProxyProfilingTernary : public OprProxyProfilingBase { - using Base = OprProxyProfilingBase; - using OprProxyProfilingBase::OprProxyProfilingBase; void exec(Opr* opr, const TensorNDArray& tensors) { - megdnn_assert(tensors.size() == 3); - if (!Base::W.valid()) { - Base::W = WorkspaceWrapper(opr->handle(), 0); + megdnn_assert(tensors.size() == arity); + if (!W.valid()) { + W = WorkspaceWrapper(opr->handle(), 0); } - if (Base::m_profiling && !Base::target_algo_info.valid()) { - size_t min_time = std::numeric_limits::max(); - for (auto algo : opr->get_all_algorithms_info(tensors[0].layout, - tensors[1].layout, - tensors[2].layout)) { - opr->execution_policy().algo = algo; - auto workspace_size = opr->get_workspace_in_bytes( - tensors[0].layout, tensors[1].layout, - tensors[2].layout); - Base::W.update(workspace_size); - - for (size_t times = 0; times < Base::warmup_times; ++times) - opr->exec(tensors[0], tensors[1], tensors[2], - Base::W.workspace()); - megcoreSynchronize(opr->handle()->megcore_computing_handle()); - Timer timer; - timer.start(); - for (size_t times = 0; times < Base::exec_times; ++times) { - opr->exec(tensors[0], tensors[1], tensors[2], - Base::W.workspace()); - } - megcoreSynchronize(opr->handle()->megcore_computing_handle()); - timer.stop(); - printf("%.3fms %s\n", timer.get_time_in_us() / 1e3, - algo.name.c_str()); - if (min_time > timer.get_time_in_us()) { - min_time = timer.get_time_in_us(); - Base::target_algo_info = algo; - } - } - opr->execution_policy().algo = Base::target_algo_info; - auto workspace_size 
= opr->get_workspace_in_bytes( - tensors[0].layout, tensors[1].layout, tensors[2].layout); - Base::W.update(workspace_size); + TensorLayoutArray layouts; + for (auto&& tensor : tensors) { + layouts.push_back(tensor.layout); } - if (!Base::target_algo_info.valid()) { - auto workspace_size = opr->get_workspace_in_bytes( - tensors[0].layout, tensors[1].layout, tensors[2].layout); - Base::W.update(workspace_size); - } - opr->exec(tensors[0], tensors[1], tensors[2], Base::W.workspace()); - } -}; - -#define DEF_PROF3(c) \ - template <> \ - struct OprProxy : public OprProxyProfilingTernary { \ - using OprProxyProfilingTernary::OprProxyProfilingTernary; \ - } - -DEF_PROF3(ConvolutionBackwardData); -DEF_PROF3(ConvolutionBackwardFilter); -DEF_PROF3(LocalShareForward); -DEF_PROF3(LocalShareBackwardData); -DEF_PROF3(LocalShareBackwardFilter); -#undef DEF_PROF3 - -template <> -struct OprProxy - : public OprProxyProfilingTernary { - using OprProxyProfilingTernary< - ConvolutionForward>::OprProxyProfilingTernary; - void exec(ConvolutionForward* opr, const TensorNDArray& tensors) { - megdnn_assert(tensors.size() == 3); - if (!Base::W.valid()) { - Base::W = WorkspaceWrapper(opr->handle(), 0); - } - if (Base::m_profiling && !Base::target_algo_info.desc.valid()) { + if (m_profiling && !target_algo_info.valid()) { size_t min_time = std::numeric_limits::max(); - for (auto algo : opr->get_all_algorithms_info(tensors[0].layout, - tensors[1].layout, - tensors[2].layout)) { + for (auto algo : + AlgoProxy::get_all_algorithms_info(opr, layouts)) { opr->execution_policy().algo = algo; - auto workspace_size = opr->get_workspace_in_bytes( - tensors[0].layout, tensors[1].layout, tensors[2].layout, - nullptr); - Base::W.update(workspace_size); + auto workspace_size = + AlgoProxy::get_workspace_in_bytes(opr, + layouts); + W.update(workspace_size); - for (size_t times = 0; times < Base::warmup_times; ++times) - opr->exec(tensors[0], tensors[1], tensors[2], nullptr, - Base::W.workspace()); + for 
(size_t times = 0; times < warmup_times; ++times) + AlgoProxy::exec(opr, tensors, W.workspace()); megcoreSynchronize(opr->handle()->megcore_computing_handle()); Timer timer; timer.start(); - for (size_t times = 0; times < Base::exec_times; ++times) { - opr->exec(tensors[0], tensors[1], tensors[2], nullptr, - Base::W.workspace()); + for (size_t times = 0; times < exec_times; ++times) { + AlgoProxy::exec(opr, tensors, W.workspace()); } megcoreSynchronize(opr->handle()->megcore_computing_handle()); timer.stop(); @@ -271,286 +201,86 @@ struct OprProxy algo.name.c_str()); if (min_time > timer.get_time_in_us()) { min_time = timer.get_time_in_us(); - Base::target_algo_info = algo; + target_algo_info = algo; } } - opr->execution_policy().algo = Base::target_algo_info; - auto workspace_size = opr->get_workspace_in_bytes( - tensors[0].layout, tensors[1].layout, tensors[2].layout, - nullptr); - Base::W.update(workspace_size); + opr->execution_policy().algo = target_algo_info; + auto workspace_size = + AlgoProxy::get_workspace_in_bytes(opr, layouts); + W.update(workspace_size); } - if (!Base::target_algo_info.desc.valid()) { - auto workspace_size = opr->get_workspace_in_bytes( - tensors[0].layout, tensors[1].layout, tensors[2].layout, - nullptr); - Base::W.update(workspace_size); + if (!target_algo_info.valid()) { + auto workspace_size = + AlgoProxy::get_workspace_in_bytes(opr, layouts); + W.update(workspace_size); } - opr->exec(tensors[0], tensors[1], tensors[2], nullptr, - Base::W.workspace()); + AlgoProxy::exec(opr, tensors, W.workspace()); } }; -template <> -struct OprWeightPreprocessProxy - : public OprProxyProfilingTernary { - using OprProxyProfilingTernary< - ConvolutionForward>::OprProxyProfilingTernary; - void exec(ConvolutionForward* opr, const TensorNDArray& tensors) { - megdnn_assert(tensors.size() == 3); - if (!Base::W.valid()) { - Base::W = WorkspaceWrapper(opr->handle(), 0); - } - if (Base::m_profiling && !Base::target_algo_info.desc.valid()) { - size_t min_time 
= std::numeric_limits::max(); - for (auto algo : opr->get_all_algorithms_info(tensors[0].layout, - tensors[1].layout, - tensors[2].layout)) { - opr->execution_policy().algo = algo; - - auto preprocess_tensors = - weight_prerocess(opr, tensors, algo.desc); - megcoreSynchronize(opr->handle()->megcore_computing_handle()); - ConvolutionForward::PreprocessedFilter preprocessed_filter{ - nullptr, *preprocess_tensors}; +#define DEF_PROF(c, arity) \ + template <> \ + struct OprProxy : public OprProxyProfilingBase { \ + using OprProxyProfilingBase::OprProxyProfilingBase; \ + } - auto workspace_size = opr->get_workspace_in_bytes( - tensors[0].layout, tensors[1].layout, tensors[2].layout, - &preprocessed_filter); - Base::W.update(workspace_size); +DEF_PROF(ConvolutionForward, 3); +DEF_PROF(ConvolutionBackwardData, 3); +DEF_PROF(ConvolutionBackwardFilter, 3); +DEF_PROF(LocalShareForward, 3); +DEF_PROF(LocalShareBackwardData, 3); +DEF_PROF(LocalShareBackwardFilter, 3); - for (size_t times = 0; times < Base::warmup_times; ++times) - opr->exec(tensors[0], tensors[1], tensors[2], - &preprocessed_filter, Base::W.workspace()); - megcoreSynchronize(opr->handle()->megcore_computing_handle()); - Timer timer; - timer.start(); - for (size_t times = 0; times < Base::exec_times; ++times) { - opr->exec(tensors[0], tensors[1], tensors[2], - &preprocessed_filter, Base::W.workspace()); - } - megcoreSynchronize(opr->handle()->megcore_computing_handle()); - timer.stop(); - printf("%.3fms %s\n", timer.get_time_in_us() / 1e3, - algo.name.c_str()); - if (min_time > timer.get_time_in_us()) { - min_time = timer.get_time_in_us(); - Base::target_algo_info = algo; - } - } - opr->execution_policy().algo = Base::target_algo_info; - auto preprocess_tensors = - weight_prerocess(opr, tensors, Base::target_algo_info.desc); - megcoreSynchronize(opr->handle()->megcore_computing_handle()); - ConvolutionForward::PreprocessedFilter preprocessed_filter{ - nullptr, *preprocess_tensors}; - auto workspace_size = 
opr->get_workspace_in_bytes( - tensors[0].layout, tensors[1].layout, tensors[2].layout, - &preprocessed_filter); - Base::W.update(workspace_size); - } - auto preprocess_tensors = - weight_prerocess(opr, tensors, Base::target_algo_info.desc); - megcoreSynchronize(opr->handle()->megcore_computing_handle()); - ConvolutionForward::PreprocessedFilter preprocessed_filter{ - nullptr, *preprocess_tensors}; - if (!Base::target_algo_info.valid()) { - auto workspace_size = opr->get_workspace_in_bytes( - tensors[0].layout, tensors[1].layout, tensors[2].layout, - &preprocessed_filter); - Base::W.update(workspace_size); - } - opr->exec(tensors[0], tensors[1], tensors[2], &preprocessed_filter, - Base::W.workspace()); - } +DEF_PROF(DeformableConvForward, 5); +DEF_PROF(DeformableConvBackwardFilter, 5); +DEF_PROF(BatchConvBiasForward, 5); +DEF_PROF(ConvBiasForward, 5); - //! handle weight preprocess - std::shared_ptr weight_prerocess( - ConvolutionForward* opr, const TensorNDArray& tensors, - const ConvolutionForward::AlgorithmDesc&) { - auto weight_perprocess_layouts = opr->deduce_preprocessed_filter_layout( - tensors[0].layout, tensors[1].layout, tensors[2].layout); - auto preprocessed_filter_tensors_ptr = - alloc_tensors(opr->handle(), weight_perprocess_layouts); - ConvolutionForward::PreprocessedFilter preprocessed_filter{ - nullptr, *preprocessed_filter_tensors_ptr}; - size_t preprocess_workspace_size = - opr->get_preprocess_workspace_in_bytes(tensors[0].layout, - tensors[1].layout, - tensors[2].layout); - WorkspaceWrapper preprocess_workspace(opr->handle(), - preprocess_workspace_size); - opr->exec_preprocess(tensors[0].layout, tensors[1], tensors[2].layout, - &preprocessed_filter, - preprocess_workspace.workspace()); - return preprocessed_filter_tensors_ptr; - } -}; +DEF_PROF(DeformableConvBackwardData, 8); +#undef DEF_PROF -template -struct OprProxyProfiling5 : public OprProxyProfilingBase { - using Base = OprProxyProfilingBase; - using 
OprProxyProfilingBase::OprProxyProfilingBase; +template +struct OprWeightPreprocessProxyImpl : public OprProxyProfilingBase { + using Base = OprProxyProfilingBase; void exec(Opr* opr, const TensorNDArray& tensors) { - megdnn_assert(tensors.size() == 5); + megdnn_assert(tensors.size() == arity); if (!Base::W.valid()) { Base::W = WorkspaceWrapper(opr->handle(), 0); } - if (Base::m_profiling && !Base::target_algo_info.valid()) { - size_t min_time = std::numeric_limits::max(); - for (auto algo : opr->get_all_algorithms_info( - tensors[0].layout, tensors[1].layout, - tensors[2].layout, tensors[3].layout, - tensors[4].layout)) { - opr->execution_policy().algo = algo; - auto workspace_size = opr->get_workspace_in_bytes( - tensors[0].layout, tensors[1].layout, tensors[2].layout, - tensors[3].layout, tensors[4].layout); - Base::W.update(workspace_size); - - for (size_t times = 0; times < Base::warmup_times; ++times) - opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], - tensors[4], Base::W.workspace()); - megcoreSynchronize(opr->handle()->megcore_computing_handle()); - Timer timer; - timer.start(); - for (size_t times = 0; times < Base::exec_times; ++times) { - opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], - tensors[4], Base::W.workspace()); - } - megcoreSynchronize(opr->handle()->megcore_computing_handle()); - timer.stop(); - printf("%.3fms %s\n", timer.get_time_in_us() / 1e3, - algo.name.c_str()); - if (min_time > timer.get_time_in_us()) { - min_time = timer.get_time_in_us(); - Base::target_algo_info = algo; - } - } - opr->execution_policy().algo = Base::target_algo_info; - auto workspace_size = opr->get_workspace_in_bytes( - tensors[0].layout, tensors[1].layout, tensors[2].layout, - tensors[3].layout, tensors[4].layout); - Base::W.update(workspace_size); - } - if (!Base::target_algo_info.valid()) { - auto workspace_size = opr->get_workspace_in_bytes( - tensors[0].layout, tensors[1].layout, tensors[2].layout, - tensors[3].layout, tensors[4].layout); - 
Base::W.update(workspace_size); - } - opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], tensors[4], - Base::W.workspace()); - } -}; - -#define DEF_PROF5(c) \ - template <> \ - struct OprProxy : public OprProxyProfiling5 { \ - using OprProxyProfiling5::OprProxyProfiling5; \ - } - -DEF_PROF5(DeformableConvForward); -DEF_PROF5(DeformableConvBackwardFilter); -DEF_PROF5(BatchConvBiasForward); -#undef DEF_PROF5 -template <> -struct OprProxy : public OprProxyProfiling5 { - using OprProxyProfiling5::OprProxyProfiling5; - void exec(ConvBiasForward* opr, const TensorNDArray& tensors) { - megdnn_assert(tensors.size() == 5); - if (!Base::W.valid()) { - Base::W = WorkspaceWrapper(opr->handle(), 0); + TensorLayoutArray layouts; + for (auto&& tensor : tensors) { + layouts.push_back(tensor.layout); } if (Base::m_profiling && !Base::target_algo_info.desc.valid()) { size_t min_time = std::numeric_limits::max(); - for (auto algo : opr->get_all_algorithms_info( - tensors[0].layout, tensors[1].layout, - tensors[2].layout, tensors[3].layout, - tensors[4].layout)) { - opr->execution_policy().algo = algo; - auto workspace_size = opr->get_workspace_in_bytes( - tensors[0].layout, tensors[1].layout, tensors[2].layout, - tensors[3].layout, tensors[4].layout, nullptr); - Base::W.update(workspace_size); - - for (size_t times = 0; times < Base::warmup_times; ++times) - opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], - tensors[4], nullptr, Base::W.workspace()); - megcoreSynchronize(opr->handle()->megcore_computing_handle()); - Timer timer; - timer.start(); - for (size_t times = 0; times < Base::exec_times; ++times) { - opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], - tensors[4], nullptr, Base::W.workspace()); - } - megcoreSynchronize(opr->handle()->megcore_computing_handle()); - timer.stop(); - printf("%.3fms %s\n", timer.get_time_in_us() / 1e3, - algo.name.c_str()); - if (min_time > timer.get_time_in_us()) { - min_time = timer.get_time_in_us(); - 
Base::target_algo_info = algo; - } - } - opr->execution_policy().algo = Base::target_algo_info; - auto workspace_size = opr->get_workspace_in_bytes( - tensors[0].layout, tensors[1].layout, tensors[2].layout, - tensors[3].layout, tensors[4].layout, nullptr); - Base::W.update(workspace_size); - } - if (!Base::target_algo_info.valid()) { - auto workspace_size = opr->get_workspace_in_bytes( - tensors[0].layout, tensors[1].layout, tensors[2].layout, - tensors[3].layout, tensors[4].layout, nullptr); - Base::W.update(workspace_size); - } - opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], tensors[4], - nullptr, Base::W.workspace()); - } -}; - -template <> -struct OprWeightPreprocessProxy - : public OprProxyProfiling5 { - using OprProxyProfiling5::OprProxyProfiling5; - void exec(ConvBiasForward* opr, const TensorNDArray& tensors) { - megdnn_assert(tensors.size() == 5); - if (!Base::W.valid()) { - Base::W = WorkspaceWrapper(opr->handle(), 0); - } - if (Base::m_profiling && !Base::target_algo_info.valid()) { - size_t min_time = std::numeric_limits::max(); - for (auto algo : opr->get_all_algorithms_info( - tensors[0].layout, tensors[1].layout, - tensors[2].layout, tensors[3].layout, - tensors[4].layout)) { + for (auto algo : + AlgoProxy::get_all_algorithms_info(opr, layouts)) { opr->execution_policy().algo = algo; auto preprocess_tensors = weight_prerocess(opr, tensors, algo.desc); megcoreSynchronize(opr->handle()->megcore_computing_handle()); - ConvBiasForward::PreprocessedFilter preprocessed_filter{ + typename Opr::PreprocessedFilter preprocessed_filter{ nullptr, *preprocess_tensors}; - auto workspace_size = opr->get_workspace_in_bytes( - tensors[0].layout, tensors[1].layout, tensors[2].layout, - tensors[3].layout, tensors[4].layout, - &preprocessed_filter); + auto workspace_size = + AlgoProxy::get_workspace_in_bytes( + opr, layouts, &preprocessed_filter); Base::W.update(workspace_size); - for (size_t times = 0; times < Base::warmup_times; ++times) - 
opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], - tensors[4], &preprocessed_filter, - Base::W.workspace()); + for (size_t times = 0; times < Base::warmup_times; ++times) { + AlgoProxy::exec(opr, tensors, + &preprocessed_filter, + Base::W.workspace()); + } megcoreSynchronize(opr->handle()->megcore_computing_handle()); Timer timer; timer.start(); for (size_t times = 0; times < Base::exec_times; ++times) { - opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], - tensors[4], &preprocessed_filter, - Base::W.workspace()); + AlgoProxy::exec(opr, tensors, + &preprocessed_filter, + Base::W.workspace()); } megcoreSynchronize(opr->handle()->megcore_computing_handle()); timer.stop(); @@ -565,125 +295,65 @@ struct OprWeightPreprocessProxy auto preprocess_tensors = weight_prerocess(opr, tensors, Base::target_algo_info.desc); megcoreSynchronize(opr->handle()->megcore_computing_handle()); - ConvBiasForward::PreprocessedFilter preprocessed_filter{ + typename Opr::PreprocessedFilter preprocessed_filter{ nullptr, *preprocess_tensors}; - auto workspace_size = opr->get_workspace_in_bytes( - tensors[0].layout, tensors[1].layout, tensors[2].layout, - tensors[3].layout, tensors[4].layout, &preprocessed_filter); + auto workspace_size = AlgoProxy::get_workspace_in_bytes( + opr, layouts, &preprocessed_filter); Base::W.update(workspace_size); } auto preprocess_tensors = weight_prerocess(opr, tensors, Base::target_algo_info.desc); megcoreSynchronize(opr->handle()->megcore_computing_handle()); - ConvBiasForward::PreprocessedFilter preprocessed_filter{ + typename Opr::PreprocessedFilter preprocessed_filter{ nullptr, *preprocess_tensors}; if (!Base::target_algo_info.valid()) { - auto workspace_size = opr->get_workspace_in_bytes( - tensors[0].layout, tensors[1].layout, tensors[2].layout, - tensors[3].layout, tensors[4].layout, &preprocessed_filter); + auto workspace_size = AlgoProxy::get_workspace_in_bytes( + opr, layouts, &preprocessed_filter); Base::W.update(workspace_size); } - 
opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], tensors[4], - &preprocessed_filter, Base::W.workspace()); + AlgoProxy::exec(opr, tensors, &preprocessed_filter, + Base::W.workspace()); } //! handle weight preprocess std::shared_ptr weight_prerocess( - ConvBiasForward* opr, const TensorNDArray& tensors, - const ConvBiasForward::AlgorithmDesc&) { - auto weight_perprocess_layouts = opr->deduce_preprocessed_filter_layout( - tensors[0].layout, tensors[1].layout, tensors[2].layout, - tensors[3].layout, tensors[4].layout); + Opr* opr, const TensorNDArray& tensors, + const typename Opr::AlgorithmDesc&) { + TensorLayoutArray layouts; + for (auto&& tensor : tensors) { + layouts.push_back(tensor.layout); + } + auto weight_perprocess_layouts = + AlgoProxy::deduce_preprocessed_filter_layout( + opr, layouts); auto preprocessed_filter_tensors_ptr = - alloc_tensors(opr->handle(), weight_perprocess_layouts); - ConvBiasForward::PreprocessedFilter preprocessed_filter{ + Base::alloc_tensors(opr->handle(), weight_perprocess_layouts); + typename Opr::PreprocessedFilter preprocessed_filter{ nullptr, *preprocessed_filter_tensors_ptr}; size_t preprocess_workspace_size = - opr->get_preprocess_workspace_in_bytes( - tensors[0].layout, tensors[1].layout, tensors[2].layout, - tensors[3].layout, tensors[4].layout); + AlgoProxy::get_preprocess_workspace_in_bytes( + opr, layouts); WorkspaceWrapper preprocess_workspace(opr->handle(), preprocess_workspace_size); - opr->exec_preprocess(tensors[0].layout, tensors[1], tensors[2], - tensors[3].layout, tensors[4].layout, - &preprocessed_filter, - preprocess_workspace.workspace()); + AlgoProxy::exec_preprocess( + opr, tensors, layouts, &preprocessed_filter, + preprocess_workspace.workspace()); return preprocessed_filter_tensors_ptr; } }; -template -struct OprProxyProfiling8 : public OprProxyProfilingBase { - using Base = OprProxyProfilingBase; - using OprProxyProfilingBase::OprProxyProfilingBase; - void exec(Opr* opr, const TensorNDArray& 
tensors) { - megdnn_assert(tensors.size() == 8); - if (!Base::W.valid()) { - Base::W = WorkspaceWrapper(opr->handle(), 0); - } - if (Base::m_profiling && !Base::target_algo_info.valid()) { - size_t min_time = std::numeric_limits::max(); - for (auto algo : opr->get_all_algorithms_info( - tensors[0].layout, tensors[1].layout, - tensors[2].layout, tensors[3].layout, - tensors[4].layout, tensors[5].layout, - tensors[6].layout, tensors[7].layout)) { - opr->execution_policy().algo = algo; - auto workspace_size = opr->get_workspace_in_bytes( - tensors[0].layout, tensors[1].layout, tensors[2].layout, - tensors[3].layout, tensors[4].layout, tensors[5].layout, - tensors[6].layout, tensors[7].layout); - Base::W.update(workspace_size); - - for (size_t times = 0; times < Base::warmup_times; ++times) - opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], - tensors[4], tensors[5], tensors[6], tensors[7], - Base::W.workspace()); - megcoreSynchronize(opr->handle()->megcore_computing_handle()); - Timer timer; - timer.start(); - for (size_t times = 0; times < Base::exec_times; ++times) { - opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], - tensors[4], tensors[5], tensors[6], tensors[7], - Base::W.workspace()); - } - megcoreSynchronize(opr->handle()->megcore_computing_handle()); - timer.stop(); - printf("%.3fms %s\n", timer.get_time_in_us() / 1e3, - algo.name.c_str()); - if (min_time > timer.get_time_in_us()) { - min_time = timer.get_time_in_us(); - Base::target_algo_info = algo; - } - } - opr->execution_policy().algo = Base::target_algo_info; - auto workspace_size = opr->get_workspace_in_bytes( - tensors[0].layout, tensors[1].layout, tensors[2].layout, - tensors[3].layout, tensors[4].layout, tensors[5].layout, - tensors[6].layout, tensors[7].layout); - Base::W.update(workspace_size); - } - if (!Base::target_algo_info.valid()) { - auto workspace_size = opr->get_workspace_in_bytes( - tensors[0].layout, tensors[1].layout, tensors[2].layout, - tensors[3].layout, 
tensors[4].layout, tensors[5].layout, - tensors[6].layout, tensors[7].layout); - Base::W.update(workspace_size); - } - opr->exec(tensors[0], tensors[1], tensors[2], tensors[3], tensors[4], - tensors[5], tensors[6], tensors[7], Base::W.workspace()); - } -}; - -#define DEF_PROF8(c) \ - template <> \ - struct OprProxy : public OprProxyProfiling8 { \ - using OprProxyProfiling8::OprProxyProfiling8; \ +#define DEF_PROF(c, arity) \ + template <> \ + struct OprWeightPreprocessProxy \ + : public OprWeightPreprocessProxyImpl { \ + using OprWeightPreprocessProxyImpl< \ + c, arity>::OprWeightPreprocessProxyImpl; \ } -DEF_PROF8(DeformableConvBackwardData); +DEF_PROF(ConvolutionForward, 3); +DEF_PROF(ConvBias, 5); +#undef DEF_PROF -#undef DEF_PROF8 } // namespace test } // namespace megdnn