diff --git a/dnn/include/megdnn/heuristic_cache.h b/dnn/include/megdnn/heuristic_cache.h new file mode 100644 index 00000000..1298ba85 --- /dev/null +++ b/dnn/include/megdnn/heuristic_cache.h @@ -0,0 +1,92 @@ +/** + * \file dnn/include/megdnn/heuristic_cache.h + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied. + */ + +#pragma once + +#include "megdnn/basic_types.h" +#include "megdnn/oprs/base.h" + +#include <mutex> +#include <string> +#include <unordered_map> + +namespace megdnn { + +class HeuristicCache { +private: + HeuristicCache() = default; + +public: + static HeuristicCache& instance(); + + struct KeyStorage { + std::string category; + std::string input; + + bool operator==(const KeyStorage& k) const { + return category == k.category && input == k.input; + } + }; + + class Key { + Handle* m_handle; + uint32_t m_opr_type; + const TensorLayout* m_inp_layouts_ptr; + size_t m_inp_layouts_size; + const void* m_param_ptr; + size_t m_param_size; + + mutable std::string m_category; + mutable std::string m_input; + + public: + Key(Handle* opr_handle, Algorithm::OprType opr_type, const TensorLayout* inp_layouts_ptr, + size_t inp_layouts_size, const void* param_ptr = nullptr, size_t param_size = 0) + : m_handle{opr_handle}, + m_opr_type{static_cast<uint32_t>(opr_type)}, + m_inp_layouts_ptr{inp_layouts_ptr}, + m_inp_layouts_size{inp_layouts_size}, + m_param_ptr{param_ptr}, + m_param_size{param_size} {} + + KeyStorage build_key_storage() const; + }; + + struct Result { + ExecutionPolicy policy; + size_t workspace; + }; + + void put(const Key& key, Result& result); + + Result get(const Key& key); + + void clear(); + +private: + struct Hash { + size_t operator()(const KeyStorage& k) const { + size_t h1 = std::hash<std::string>{}(k.category); + size_t h2 = std::hash<std::string>{}(k.input); + h1 ^= h2 + 0x9e3779b9 + (h1 << 6) + (h1 >> 2); + return h1; + } + }; + std::unordered_map<KeyStorage, Result, Hash> m_heuristic_cache; +#if __DEPLOY_ON_XP_SP2__ + size_t m_mtx; +#else + std::mutex m_mtx; +#endif +}; + +} // namespace megdnn diff --git a/dnn/include/megdnn/oprs/linalg.h b/dnn/include/megdnn/oprs/linalg.h index 135200a1..baf69d3b 100644 --- a/dnn/include/megdnn/oprs/linalg.h +++ b/dnn/include/megdnn/oprs/linalg.h @@ -42,6 +42,10 @@ public: const TensorLayout& B, const TensorLayout& C) = 0; + static Algorithm::OprType get_opr_type() { + return Algorithm::OprType::BATCHED_MATRIX_MUL_FORWARD; + } + protected: void check_exec(const TensorLayout& A, const TensorLayout& B, const TensorLayout& C, size_t workspace_in_bytes); @@ -76,6 +80,11 @@ public: const TensorLayout& C) = 0; static size_t pack_size (const Param::Format format); + + static Algorithm::OprType get_opr_type() { + return Algorithm::OprType::MATRIX_MUL_FORWARD; + } + protected: void check_exec(const TensorLayout& A, const TensorLayout& B, const TensorLayout& C, size_t workspace_in_bytes); diff --git a/dnn/include/megdnn/oprs/nn.h b/dnn/include/megdnn/oprs/nn.h index 56b10206..31ce6912 100644 --- a/dnn/include/megdnn/oprs/nn.h +++ b/dnn/include/megdnn/oprs/nn.h @@ -275,6 +275,10 @@ public: const TensorLayout& src, const TensorLayout& filter, const TensorLayout& dst) = 0; + static Algorithm::OprType get_opr_type() { + return Algorithm::OprType::CONVOLUTION_FORWARD; + } + protected: CanonizedFilterMeta check_exec( const
TensorLayout& src, const TensorLayout& filter, @@ -309,6 +313,10 @@ public: void deduce_layout(const TensorLayout& filter, const TensorLayout& diff, TensorLayout& grad); + static Algorithm::OprType get_opr_type() { + return Algorithm::OprType::CONVOLUTION_BACKWARD_DATA; + } + protected: CanonizedFilterMeta check_exec(const TensorLayout& filter, const TensorLayout& diff, @@ -338,6 +346,10 @@ public: const TensorLayout& diff, const TensorLayout& grad) = 0; + static Algorithm::OprType get_opr_type() { + return Algorithm::OprType::CONVOLUTION_BACKWARD_FILTER; + } + protected: CanonizedFilterMeta check_exec(const TensorLayout& src, const TensorLayout& diff, @@ -505,6 +517,10 @@ public: const ConvBiasForward::BiasMode bias_mode, const param::ConvBias::NonlineMode nonline_mode); + static Algorithm::OprType get_opr_type() { + return Algorithm::OprType::CONVBIAS_FORWARD; + } + protected: CanonizedFilterMeta check_exec( const TensorLayout& src, const TensorLayout& filter, @@ -775,6 +791,10 @@ public: virtual size_t get_workspace_in_bytes(const TensorLayout& src, const TensorLayout& dst) = 0; + static Algorithm::OprType get_opr_type() { + return Algorithm::OprType::POOLING_FORWARD; + } + protected: void check_exec(const TensorLayout& src, const TensorLayout& dst, size_t workspace_in_bytes); @@ -801,6 +821,10 @@ public: const TensorLayout& diff, const TensorLayout& grad) = 0; + static Algorithm::OprType get_opr_type() { + return Algorithm::OprType::POOLING_BACKWARD; + } + protected: void check_exec(const TensorLayout& src, const TensorLayout& dst, const TensorLayout& diff, const TensorLayout& grad, @@ -1216,6 +1240,10 @@ public: const TensorLayout& filter, const TensorLayout& dst) = 0; + static Algorithm::OprType get_opr_type() { + return Algorithm::OprType::CONVOLUTION3D_FORWARD; + } + protected: CanonizedFilterMeta check_exec(const TensorLayout& src, const TensorLayout& filter, @@ -1244,6 +1272,10 @@ public: void deduce_layout(const TensorLayout& filter, const TensorLayout& diff, TensorLayout& grad); + static Algorithm::OprType get_opr_type() { + return Algorithm::OprType::CONVOLUTION3D_BACKWARD_DATA; + } + protected: CanonizedFilterMeta check_exec(const TensorLayout& filter, const TensorLayout& diff, @@ -1268,6 +1300,10 @@ public: const TensorLayout& diff, const TensorLayout& grad) = 0; + static Algorithm::OprType get_opr_type() { + return Algorithm::OprType::CONVOLUTION3D_BACKWARD_FILTER; + } + protected: CanonizedFilterMeta check_exec(const TensorLayout& src, const TensorLayout& diff, @@ -1308,6 +1344,10 @@ public: const TensorLayout& filter, const TensorLayout& dst) = 0; + static Algorithm::OprType get_opr_type() { + return Algorithm::OprType::LOCAL_SHARE_FORWARD; + } + protected: void check_exec(const TensorLayout& src, const TensorLayout& filter, const TensorLayout& dst, size_t workspace_in_bytes); @@ -1334,6 +1374,10 @@ public: void deduce_layout(const TensorLayout& filter, const TensorLayout& diff, TensorLayout& grad); + static Algorithm::OprType get_opr_type() { + return Algorithm::OprType::LOCAL_SHARE_BACKWARD_DATA; + } + protected: void check_exec(const TensorLayout& filter, const TensorLayout& diff, const TensorLayout& grad, size_t workspace_in_bytes); @@ -1358,6 +1402,10 @@ public: const TensorLayout& diff, const TensorLayout& grad) = 0; + static Algorithm::OprType get_opr_type() { + return Algorithm::OprType::LOCAL_SHARE_BACKWARD_FILTER; + } + protected: void check_exec(const TensorLayout& src, const TensorLayout& diff, const TensorLayout& grad, size_t workspace_in_bytes); @@ -1479,6 
+1527,10 @@ public: const TensorLayout& mask, const TensorLayout& dst) = 0; + static Algorithm::OprType get_opr_type() { + return Algorithm::OprType::DEFORMABLE_CONV_FORWARD; + } + protected: CanonizedFilterMeta check_exec(const TensorLayout& im, const TensorLayout& filter, @@ -1520,6 +1572,10 @@ public: const TensorLayout& mask, const TensorLayout& out_grad, TensorLayout& filter_grad); + static Algorithm::OprType get_opr_type() { + return Algorithm::OprType::DEFORMABLE_CONV_BACKWARD_FILTER; + } + protected: CanonizedFilterMeta check_exec(const TensorLayout& im, const TensorLayout& offset, @@ -1566,6 +1622,10 @@ public: const TensorLayout& out_grad, TensorLayout& im_grad, TensorLayout& offset_grad, TensorLayout& mask_grad); + static Algorithm::OprType get_opr_type() { + return Algorithm::OprType::DEFORMABLE_CONV_BACKWARD_DATA; + } + protected: CanonizedFilterMeta check_exec( const TensorLayout& im, const TensorLayout& filter, @@ -1677,6 +1737,10 @@ public: const TensorLayout& z, const TensorLayout& dst) = 0; + static Algorithm::OprType get_opr_type() { + return Algorithm::OprType::BATCH_CONV_FORWARD; + } + protected: CanonizedFilterMeta check_exec(const TensorLayout& src, const TensorLayout& filter, diff --git a/dnn/src/arm_common/pooling/opr_impl.cpp b/dnn/src/arm_common/pooling/opr_impl.cpp index 43edb7ac..6e5b62f8 100644 --- a/dnn/src/arm_common/pooling/opr_impl.cpp +++ b/dnn/src/arm_common/pooling/opr_impl.cpp @@ -101,6 +101,15 @@ PoolingImpl::PoolingKernParam PoolingImpl::make_pooling_kern_param( size_t PoolingImpl::get_workspace_in_bytes(const TensorLayout& src, const TensorLayout& dst) { + TensorLayoutArray layouts{src, dst}; + HeuristicCache::Key key{this->handle(), this->get_opr_type(), + layouts.data(), layouts.size(), &this->param(), + sizeof(this->param())}; + auto rst = HeuristicCache::instance().get(key); + if (rst.policy.algo.valid()) { + return rst.workspace; + } + auto param = make_pooling_kern_szie_param(this, src, dst); auto algo = get_algorithm(this, src, dst); if (!is_fallback_algo(algo)) { diff --git a/dnn/src/common/algo_chooser.h b/dnn/src/common/algo_chooser.h index bc5dda09..9c0964d5 100644 --- a/dnn/src/common/algo_chooser.h +++ b/dnn/src/common/algo_chooser.h @@ -6,7 +6,8 @@ * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied. */ #pragma once @@ -17,10 +18,28 @@ #include #include "megdnn/common.h" +#include "megdnn/heuristic_cache.h" #include "utils.h" namespace megdnn { +template <typename Opr, typename... Args> +size_t get_dnn_workspace(Opr* opr, Args&&... args) { + TensorLayoutArray layouts{{args...}}; + HeuristicCache::Key key{opr->handle(), opr->get_opr_type(), + layouts.data(), layouts.size(), &opr->param(), + sizeof(opr->param())}; + auto rst = HeuristicCache::instance().get(key); + if (rst.policy.algo.valid()) { + return rst.workspace; + } + + typename Opr::AlgoBase::SizeArgs size_args(opr, + std::forward<Args>(args)...); + return get_algorithm(opr, std::forward<Args>(args)...) + ->get_workspace_in_bytes(size_args); +} + /*! * \brief get user-configured algorithm, or heuristic algorithm */ @@ -31,9 +50,20 @@ typename Opr::AlgoBase* get_algorithm(Opr* opr, Args&&...
args) { if (set.valid()) { ret = set; } else { - ret = opr->get_algorithm_info_heuristic( - std::forward<Args>(args)..., std::numeric_limits<size_t>::max(), - AlgoAttribute::DEFAULT, AlgoAttribute::DEFAULT).desc; + TensorLayoutArray layouts{{args...}}; + HeuristicCache::Key key{opr->handle(), opr->get_opr_type(), + layouts.data(), layouts.size(), &opr->param(), + sizeof(opr->param())}; + auto rst = HeuristicCache::instance().get(key); + if (rst.policy.algo.valid()) { + ret = rst.policy.algo; + } else { + ret = opr->get_algorithm_info_heuristic( + std::forward<Args>(args)..., + std::numeric_limits<size_t>::max(), + AlgoAttribute::DEFAULT, AlgoAttribute::DEFAULT) + .desc; + } } return static_cast<typename Opr::AlgoBase*>( opr->get_algorithm_from_desc(ret)); diff --git a/dnn/src/common/deformable_conv.cpp b/dnn/src/common/deformable_conv.cpp index 5328bda0..18db856a 100644 --- a/dnn/src/common/deformable_conv.cpp +++ b/dnn/src/common/deformable_conv.cpp @@ -250,13 +250,9 @@ CanonizedFilterMeta DeformableConvBackwardData::check_exec( megdnn_assert_eq_dtype(im, mask_grad); // check layout - megdnn_assert(im.shape == im_grad.shape, "invalid im_grad shape: %s", - megdnn_layout_msg(im_grad).c_str()); - megdnn_assert(offset.shape == offset_grad.shape, - "invalid offset_grad shape: %s", - megdnn_layout_msg(offset_grad).c_str()); - megdnn_assert(mask.shape == mask_grad.shape, "invalid mask_grad shape: %s", - megdnn_layout_msg(mask_grad).c_str()); + megdnn_assert_eq_shape(im, im_grad); + megdnn_assert_eq_shape(offset, offset_grad); + megdnn_assert_eq_shape(mask, mask_grad); auto ret = make_canonized_filter_meta(im.ndim, filter, offset); auto required_workspace_in_bytes = diff --git a/dnn/src/common/heuristic_cache.cpp b/dnn/src/common/heuristic_cache.cpp new file mode 100644 index 00000000..8ca1a593 --- /dev/null +++ b/dnn/src/common/heuristic_cache.cpp @@ -0,0 +1,142 @@ +/** + * \file dnn/src/common/heuristic_cache.cpp + * MegEngine is Licensed under the Apache License, Version 2.0 (the "License") + * + * Copyright (c) 2014-2021 Megvii Inc. All rights reserved. + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + * implied.
+ */ + +#include "megdnn/heuristic_cache.h" +#include "src/common/utils.h" +#include "src/naive/handle.h" + +#if MEGDNN_WITH_CUDA +#include "src/cuda/utils.h" +#endif + +#if MEGDNN_WITH_ROCM +#include "hcc_detail/hcc_defs_prologue.h" +#include "megcore_rocm.h" +#include "src/rocm/utils.h" +#endif + +using namespace megdnn; + +HeuristicCache& HeuristicCache::instance() { + static HeuristicCache ins; + return ins; +} + +HeuristicCache::KeyStorage HeuristicCache::Key::build_key_storage() const { + auto&& ctg = m_category; + auto&& inp = m_input; + + if (!m_category.empty() && !m_input.empty()) + return {ctg, inp}; + + inp.reserve(sizeof(TensorLayout) * 3 * m_inp_layouts_size + m_param_size); + for (size_t i = 0; i < m_inp_layouts_size; i++) { + auto&& ly = m_inp_layouts_ptr[i]; + for (size_t j = 0; j < ly.ndim; j++) { + if (j) + inp.push_back(','); + inp.append(std::to_string(ly.shape[j])); + } + inp.push_back(';'); + for (size_t j = 0; j < ly.ndim; j++) { + if (j) + inp.push_back(','); + inp.append(std::to_string(ly.stride[j])); + } + inp.push_back(';'); + inp.append(ly.dtype.name()); + inp.push_back(';'); + inp.append(ly.format.to_string().c_str()); + inp.push_back('|'); + } + if (m_param_size) { + inp.append(reinterpret_cast<const char*>(m_param_ptr), m_param_size); + } + + ctg = "plat:"; + ctg.append(std::to_string(static_cast<uint32_t>(m_handle->type()))); + switch (m_handle->type()) { +#if MEGDNN_WITH_CUDA + case Handle::HandleType::CUDA: { + int cuda_rt = -1; + cuda_check(cudaRuntimeGetVersion(&cuda_rt)); + cuda_rt /= 1000; + auto&& handle = static_cast<cuda::HandleImpl*>(m_handle); + auto&& prop = handle->device_prop(); + ctg.append(ssprintf(";dev=%s;cap=%d.%d;runtime=%d;", + prop.name, prop.major, prop.minor, cuda_rt)); + break; + } +#endif +#if MEGDNN_WITH_ROCM + case Handle::HandleType::ROCM: { + auto&& handle = static_cast<rocm::HandleImpl*>(m_handle); + auto&& prop = handle->device_prop(); + int drv = -1, hip_rt = -1; + hip_check(hipDriverGetVersion(&drv)); + hip_check(hipRuntimeGetVersion(&hip_rt)); + ctg.append(ssprintf(";dev=%s;cap=%d.%d,drv=%d;runtime=%d;", + prop.name, prop.major, prop.minor, drv, hip_rt)); + break; + } +#endif + case Handle::HandleType::FALLBACK: +#if MEGDNN_X86 + case Handle::HandleType::X86: +#endif +#if MEGDNN_AARCH64 || MEGDNN_ARMV7 + case Handle::HandleType::ARM_COMMON: +#endif +#if MEGDNN_AARCH64 + case Handle::HandleType::AARCH64: +#endif +#if MEGDNN_ARMV7 + case Handle::HandleType::ARMV7: +#endif + { + size_t nr_threads = + static_cast<naive::HandleImpl*>(m_handle) + ->megcore_dispatcher() + ->nr_threads(); + ctg.append(";"); + ctg.append(std::to_string(nr_threads)); + ctg.append(";"); + break; + } + default: + ctg.append(";"); + } + ctg.append(std::to_string(m_opr_type)); + return {ctg, inp}; +} + +void HeuristicCache::put(const Key& key, Result& result) { + MEGDNN_LOCK_GUARD(m_mtx); + if (result.policy.algo.valid()) + m_heuristic_cache[key.build_key_storage()] = result; +} + +HeuristicCache::Result HeuristicCache::get(const Key& key) { + MEGDNN_LOCK_GUARD(m_mtx); + KeyStorage ks = key.build_key_storage(); + auto iter = m_heuristic_cache.find(ks); + if (iter == m_heuristic_cache.end()) { + return {}; + } else { + return iter->second; + } +} + +void HeuristicCache::clear() { + MEGDNN_LOCK_GUARD(m_mtx); + m_heuristic_cache.clear(); +} \ No newline at end of file diff --git a/dnn/src/cuda/batch_conv_bias/opr_impl.cpp b/dnn/src/cuda/batch_conv_bias/opr_impl.cpp index b6842fbb..9d3b3711 100644 --- a/dnn/src/cuda/batch_conv_bias/opr_impl.cpp +++ b/dnn/src/cuda/batch_conv_bias/opr_impl.cpp @@ -56,9 +56,7 @@ size_t
BatchConvBiasForwardImpl::get_workspace_in_bytes( const TensorLayout& src, const TensorLayout& filter, const TensorLayout& bias, const TensorLayout& z, const TensorLayout& dst) { - AlgoBase::SizeArgs args(this, src, filter, bias, z, dst); - return get_algorithm(this, src, filter, bias, z, dst) - ->get_workspace_in_bytes(args); + return get_dnn_workspace(this, src, filter, bias, z, dst); } void BatchConvBiasForwardImpl::exec(_megdnn_tensor_in src, @@ -66,10 +64,12 @@ void BatchConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in bias, _megdnn_tensor_in z, _megdnn_tensor_out dst, _megdnn_workspace workspace) { + check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout, + workspace.size); AlgoBase::ExecArgs args(this, src, filter, bias, z, dst, workspace); auto algo = get_algorithm(this, src.layout, filter.layout, bias.layout, z.layout, dst.layout); - algo->check_workspace(args, workspace).exec(args); + algo->exec(args); } const char* BatchConvBiasForwardImpl::get_algorithm_set_name() const { diff --git a/dnn/src/cuda/batched_matrix_mul/opr_impl.cpp b/dnn/src/cuda/batched_matrix_mul/opr_impl.cpp index 1cf80527..7bc03766 100644 --- a/dnn/src/cuda/batched_matrix_mul/opr_impl.cpp +++ b/dnn/src/cuda/batched_matrix_mul/opr_impl.cpp @@ -33,13 +33,12 @@ void BatchedMatrixMulForwardImpl::exec(_megdnn_tensor_in A, _megdnn_tensor_in B, AlgoBase::ExecArgs args(this, A, B, C, workspace); check_exec(A.layout, B.layout, C.layout, workspace.size); auto&& algo = megdnn::get_algorithm(this, A.layout, B.layout, C.layout); - algo->check_workspace(args, workspace).exec(args); + algo->exec(args); } size_t BatchedMatrixMulForwardImpl::get_workspace_in_bytes( const TensorLayout& A, const TensorLayout& B, const TensorLayout& C) { - AlgoBase::SizeArgs args(this, A, B, C); - return megdnn::get_algorithm(this, A, B, C)->get_workspace_in_bytes(args); + return get_dnn_workspace(this, A, B, C); } std::vector BatchedMatrixMulForwardImpl::get_all_algorithms( diff --git a/dnn/src/cuda/conv_bias/opr_impl.cpp b/dnn/src/cuda/conv_bias/opr_impl.cpp index b11a0539..2a00f1f6 100644 --- a/dnn/src/cuda/conv_bias/opr_impl.cpp +++ b/dnn/src/cuda/conv_bias/opr_impl.cpp @@ -36,7 +36,7 @@ void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, preprocessed_filter); auto algo = get_algorithm(this, src.layout, filter.layout, bias.layout, z.layout, dst.layout); - algo->check_workspace(args, workspace).exec(args); + algo->exec(args); }; std::vector @@ -228,6 +228,15 @@ size_t ConvBiasForwardImpl::get_workspace_in_bytes( const TensorLayout& bias, const TensorLayout& z, const TensorLayout& dst, const PreprocessedFilter* preprocessed_filter) { + TensorLayoutArray layouts{src, filter, bias, z, dst}; + HeuristicCache::Key key{this->handle(), this->get_opr_type(), + layouts.data(), layouts.size(), &this->param(), + sizeof(this->param())}; + auto rst = HeuristicCache::instance().get(key); + if (rst.policy.algo.valid()) { + return rst.workspace; + } + AlgoBase::SizeArgs args{ this, src, filter, bias, z, dst, preprocessed_filter}; return get_algorithm(this, src, filter, bias, z, dst) diff --git a/dnn/src/cuda/convolution/opr_impl.cpp b/dnn/src/cuda/convolution/opr_impl.cpp index c3fbf76b..560da586 100644 --- a/dnn/src/cuda/convolution/opr_impl.cpp +++ b/dnn/src/cuda/convolution/opr_impl.cpp @@ -58,9 +58,7 @@ size_t ConvolutionForwardImpl::get_workspace_in_bytes( const TensorLayout& dst, const PreprocessedFilter* preprocessed_filter) { MEGDNN_MARK_USED_VAR(preprocessed_filter); - AlgoBase::SizeArgs 
args{this, src, filter, dst}; - return megdnn::get_algorithm(this, src, filter, dst) - ->get_workspace_in_bytes(args); + return get_dnn_workspace(this, src, filter, dst); } void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, @@ -72,7 +70,7 @@ void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, preprocessed_filter); AlgoBase::ExecArgs args(this, src, filter, dst, workspace); auto&& algo = get_algorithm(this, src.layout, filter.layout, dst.layout); - algo->check_workspace(args, workspace).exec(args); + algo->exec(args); } const char* ConvolutionForwardImpl::get_algorithm_set_name() const { @@ -85,9 +83,10 @@ void ConvolutionBackwardDataImpl::exec(_megdnn_tensor_in filter, _megdnn_tensor_in diff, _megdnn_tensor_out grad, _megdnn_workspace workspace) { + check_exec(filter.layout, diff.layout, grad.layout, workspace.size); AlgoBase::ExecArgs args(this, filter, diff, grad, workspace); auto algo = get_algorithm(this, filter.layout, diff.layout, grad.layout); - algo->check_workspace(args, workspace).exec(args); + algo->exec(args); } std::vector @@ -196,9 +195,7 @@ ConvolutionBackwardDataImpl::get_algorithm_heuristic( size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes( const TensorLayout& filter, const TensorLayout& diff, const TensorLayout& grad) { - AlgoBase::SizeArgs args(this, filter, diff, grad); - return get_algorithm(this, filter, diff, grad) - ->get_workspace_in_bytes(args); + return get_dnn_workspace(this, filter, diff, grad); } const char* ConvolutionBackwardDataImpl::get_algorithm_set_name() const { @@ -211,9 +208,10 @@ void ConvolutionBackwardFilterImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in diff, _megdnn_tensor_out grad, _megdnn_workspace workspace) { + check_exec(src.layout, diff.layout, grad.layout, workspace.size); AlgoBase::ExecArgs args(this, src, diff, grad, workspace); auto algo = get_algorithm(this, src.layout, diff.layout, grad.layout); - algo->check_workspace(args, workspace).exec(args); + algo->exec(args); } std::vector @@ -324,9 +322,7 @@ ConvolutionBackwardFilterImpl::get_algorithm_heuristic( size_t ConvolutionBackwardFilterImpl::get_workspace_in_bytes( const TensorLayout& src, const TensorLayout& diff, const TensorLayout& grad) { - AlgoBase::SizeArgs args(this, src, diff, grad); - return get_algorithm(this, src, diff, grad) - ->get_workspace_in_bytes(args); + return get_dnn_workspace(this, src, diff, grad); } const char* ConvolutionBackwardFilterImpl::get_algorithm_set_name() const { diff --git a/dnn/src/cuda/convolution3d/opr_impl.cpp b/dnn/src/cuda/convolution3d/opr_impl.cpp index 8dec7748..607edf55 100644 --- a/dnn/src/cuda/convolution3d/opr_impl.cpp +++ b/dnn/src/cuda/convolution3d/opr_impl.cpp @@ -111,18 +111,17 @@ Convolution3DForwardImpl::get_all_algorithms(const TensorLayout& src, size_t Convolution3DForwardImpl::get_workspace_in_bytes( const TensorLayout& src, const TensorLayout& filter, const TensorLayout& dst) { - AlgoBase::SizeArgs args(this, src, filter, dst); - return get_algorithm(this, src, filter, dst) - ->get_workspace_in_bytes(args); + return get_dnn_workspace(this, src, filter, dst); } void Convolution3DForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_out dst, _megdnn_workspace workspace) { + check_exec(src.layout, filter.layout, dst.layout, workspace.size); AlgoBase::ExecArgs args(this, src, filter, dst, workspace); auto algo = get_algorithm(this, src.layout, filter.layout, dst.layout); - algo->check_workspace(args, workspace).exec(args); + algo->exec(args); } const char* 
Convolution3DForwardImpl::get_algorithm_set_name() const { @@ -133,9 +132,10 @@ void Convolution3DBackwardDataImpl::exec(_megdnn_tensor_in filter, _megdnn_tensor_in diff, _megdnn_tensor_out grad, _megdnn_workspace workspace) { + check_exec(filter.layout, diff.layout, grad.layout, workspace.size); AlgoBase::ExecArgs args(this, filter, diff, grad, workspace); auto algo = get_algorithm(this, filter.layout, diff.layout, grad.layout); - algo->check_workspace(args, workspace).exec(args); + algo->exec(args); } std::vector @@ -200,9 +200,7 @@ Convolution3DBackwardDataImpl::get_algorithm_heuristic( size_t Convolution3DBackwardDataImpl::get_workspace_in_bytes( const TensorLayout& filter, const TensorLayout& diff, const TensorLayout& grad) { - AlgoBase::SizeArgs args(this, filter, diff, grad); - return get_algorithm(this, filter, diff, grad) - ->get_workspace_in_bytes(args); + return get_dnn_workspace(this, filter, diff, grad); } const char* Convolution3DBackwardDataImpl::get_algorithm_set_name() const { @@ -213,10 +211,11 @@ void Convolution3DBackwardFilterImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in diff, _megdnn_tensor_out grad, _megdnn_workspace workspace) { + check_exec(src.layout, diff.layout, grad.layout, workspace.size); AlgoBase::ExecArgs args(this, src, diff, grad, workspace); auto algo = get_algorithm(this, src.layout, diff.layout, grad.layout); - algo->check_workspace(args, workspace).exec(args); + algo->exec(args); } std::vector @@ -281,9 +280,7 @@ Convolution3DBackwardFilterImpl::get_algorithm_heuristic( size_t Convolution3DBackwardFilterImpl::get_workspace_in_bytes( const TensorLayout& src, const TensorLayout& diff, const TensorLayout& grad) { - AlgoBase::SizeArgs args(this, src, diff, grad); - return get_algorithm(this, src, diff, grad) - ->get_workspace_in_bytes(args); + return get_dnn_workspace(this, src, diff , grad); } const char* Convolution3DBackwardFilterImpl::get_algorithm_set_name() const { diff --git a/dnn/src/cuda/deformable_conv/opr_impl.cpp b/dnn/src/cuda/deformable_conv/opr_impl.cpp index dc5e643b..167d66dc 100644 --- a/dnn/src/cuda/deformable_conv/opr_impl.cpp +++ b/dnn/src/cuda/deformable_conv/opr_impl.cpp @@ -36,8 +36,7 @@ size_t Fwd::get_workspace_in_bytes(const TensorLayout& im, const TensorLayout& offset, const TensorLayout& mask, const TensorLayout& dst) { - auto algo = get_algorithm(this, im, filter, offset, mask, dst); - return algo->get_workspace_in_bytes({this, im, filter, offset, mask, dst}); + return get_dnn_workspace(this, im, filter, offset, mask, dst); } std::vector Fwd::get_all_algorithms(const TensorLayout& /* im */, @@ -96,13 +95,13 @@ const char* Fwd::get_algorithm_set_name() const { void Fwd::exec(_megdnn_tensor_in im, _megdnn_tensor_in filter, _megdnn_tensor_in offset, _megdnn_tensor_in mask, _megdnn_tensor_out out, _megdnn_workspace workspace) { + check_exec(im.layout, filter.layout, offset.layout, mask.layout, out.layout, + workspace.size); auto algo = get_algorithm(this, im.layout, filter.layout, offset.layout, mask.layout, out.layout); AlgoBase::ExecArgs args(this, im, filter, offset, mask, out, workspace); - - algo->check_workspace(args, workspace).exec(args); - return; + algo->exec(args); } /* ============== BwdFlt Implementation ============== */ @@ -152,21 +151,23 @@ AlgoBwdFlt* BwdFlt::get_algorithm_heuristic( size_t BwdFlt::get_workspace_in_bytes( const TensorLayout& im, const TensorLayout& offset, const TensorLayout& mask, const TensorLayout& out_grad, const TensorLayout& filter_grad) { - auto algo = get_algorithm(this, im, offset, 
mask, out_grad, filter_grad); - return algo->get_workspace_in_bytes({this, im, offset, mask, out_grad, filter_grad}); + return get_dnn_workspace(this, im, offset, mask, out_grad, filter_grad); } const char* BwdFlt::get_algorithm_set_name() const { return "DEFORMABLE_CONV_BWD_FILTER_CUDA"; }; -void BwdFlt::exec(_megdnn_tensor_in im, _megdnn_tensor_in offset, _megdnn_tensor_in mask, - _megdnn_tensor_in out_grad, _megdnn_tensor_out filter_grad, - _megdnn_workspace workspace) { - AlgoBase::ExecArgs args(this, im, offset, mask, out_grad, filter_grad, workspace); - auto algo = get_algorithm(this, im.layout, offset.layout, mask.layout, out_grad.layout, - filter_grad.layout); - algo->check_workspace(args, workspace).exec(args); +void BwdFlt::exec(_megdnn_tensor_in im, _megdnn_tensor_in offset, + _megdnn_tensor_in mask, _megdnn_tensor_in out_grad, + _megdnn_tensor_out filter_grad, _megdnn_workspace workspace) { + check_exec(im.layout, offset.layout, mask.layout, out_grad.layout, + filter_grad.layout, workspace.size); + AlgoBase::ExecArgs args(this, im, offset, mask, out_grad, filter_grad, + workspace); + auto algo = get_algorithm(this, im.layout, offset.layout, mask.layout, + out_grad.layout, filter_grad.layout); + algo->exec(args); } /* ============== BwdData Implementation ============== */ @@ -222,10 +223,8 @@ size_t BwdData::get_workspace_in_bytes( const TensorLayout& offset, const TensorLayout& mask, const TensorLayout& out_grad, const TensorLayout& im_grad, const TensorLayout& offset_grad, const TensorLayout& mask_grad) { - auto algo = get_algorithm(this, im, filter, offset, mask, out_grad, - im_grad, offset_grad, mask_grad); - return algo->get_workspace_in_bytes({this, im, filter, offset, mask, out_grad, - im_grad, offset_grad, mask_grad}); + return get_dnn_workspace(this, im, filter, offset, mask, out_grad, im_grad, + offset_grad, mask_grad); } const char* BwdData::get_algorithm_set_name() const { @@ -233,16 +232,19 @@ const char* BwdData::get_algorithm_set_name() const { }; void BwdData::exec(_megdnn_tensor_in im, _megdnn_tensor_in filter, - _megdnn_tensor_in offset, _megdnn_tensor_in mask, - _megdnn_tensor_in out_grad, _megdnn_tensor_out im_grad, - _megdnn_tensor_out offset_grad, _megdnn_tensor_out mask_grad, - _megdnn_workspace workspace) { + _megdnn_tensor_in offset, _megdnn_tensor_in mask, + _megdnn_tensor_in out_grad, _megdnn_tensor_out im_grad, + _megdnn_tensor_out offset_grad, _megdnn_tensor_out mask_grad, + _megdnn_workspace workspace) { + check_exec(im.layout, filter.layout, offset.layout, mask.layout, + out_grad.layout, im_grad.layout, offset_grad.layout, + mask_grad.layout, workspace.size); AlgoBase::ExecArgs args(this, im, filter, offset, mask, out_grad, im_grad, offset_grad, mask_grad, workspace); auto algo = get_algorithm(this, im.layout, filter.layout, offset.layout, mask.layout, out_grad.layout, im_grad.layout, offset_grad.layout, mask_grad.layout); - algo->check_workspace(args, workspace).exec(args); + algo->exec(args); } // vim: syntax=cpp.doxygen diff --git a/dnn/src/cuda/local_share/opr_impl.cpp b/dnn/src/cuda/local_share/opr_impl.cpp index 2a670eaf..786991e2 100644 --- a/dnn/src/cuda/local_share/opr_impl.cpp +++ b/dnn/src/cuda/local_share/opr_impl.cpp @@ -59,17 +59,17 @@ LocalShareForwardImpl::get_all_algorithms(const TensorLayout& src, size_t LocalShareForwardImpl::get_workspace_in_bytes(const TensorLayout& src, const TensorLayout& filter, const TensorLayout& dst) { - AlgoBase::SizeArgs args(this, src, filter, dst); - return get_algorithm(this, src, filter, 
dst)->get_workspace_in_bytes(args); + return get_dnn_workspace(this, src, filter, dst); } void LocalShareForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_out dst, _megdnn_workspace workspace) { + check_exec(src.layout, filter.layout, dst.layout, workspace.size); AlgoBase::ExecArgs args(this, src, filter, dst, workspace); auto algo = get_algorithm(this, src.layout, filter.layout, dst.layout); - algo->check_workspace(args, workspace).exec(args); + algo->exec(args); } const char* LocalShareForwardImpl::get_algorithm_set_name() const { @@ -112,8 +112,7 @@ LocalShareBackwardDataImpl::get_all_algorithms(const TensorLayout& filter, size_t LocalShareBackwardDataImpl::get_workspace_in_bytes(const TensorLayout& filter, const TensorLayout& diff, const TensorLayout& grad) { - AlgoBase::SizeArgs args(this, filter, diff, grad); - return get_algorithm(this, filter, diff, grad)->get_workspace_in_bytes(args); + return get_dnn_workspace(this, filter, diff, grad); } void LocalShareBackwardDataImpl::exec(_megdnn_tensor_in filter, @@ -166,8 +165,7 @@ LocalShareBackwardFilterImpl::get_all_algorithms(const TensorLayout& src, size_t LocalShareBackwardFilterImpl::get_workspace_in_bytes(const TensorLayout& src, const TensorLayout& diff, const TensorLayout& grad) { - AlgoBase::SizeArgs args(this, src, diff, grad); - return get_algorithm(this, src, diff, grad)->get_workspace_in_bytes(args); + return get_dnn_workspace(this, src, diff, grad); } void LocalShareBackwardFilterImpl::exec(_megdnn_tensor_in src, diff --git a/dnn/src/cuda/matrix_mul/opr_impl.cpp b/dnn/src/cuda/matrix_mul/opr_impl.cpp index b2665913..059998df 100644 --- a/dnn/src/cuda/matrix_mul/opr_impl.cpp +++ b/dnn/src/cuda/matrix_mul/opr_impl.cpp @@ -59,8 +59,7 @@ MatrixMulForwardImpl::Algorithm* MatrixMulForwardImpl::get_algorithm_heuristic( size_t MatrixMulForwardImpl::get_workspace_in_bytes(const TensorLayout& A, const TensorLayout& B, const TensorLayout& C) { - AlgoBase::SizeArgs args{this, A, B, C}; - return megdnn::get_algorithm(this, A, B, C)->get_workspace_in_bytes(args); + return get_dnn_workspace(this, A, B, C); } void MatrixMulForwardImpl::exec(_megdnn_tensor_in A, _megdnn_tensor_in B, @@ -69,7 +68,7 @@ void MatrixMulForwardImpl::exec(_megdnn_tensor_in A, _megdnn_tensor_in B, check_exec(A.layout, B.layout, C.layout, workspace.size); AlgoBase::ExecArgs args(this, A, B, C, workspace); auto&& algo = get_algorithm(this, A.layout, B.layout, C.layout); - algo->check_workspace(args, workspace).exec(args); + algo->exec(args); } } // namespace cuda diff --git a/dnn/src/cuda/pooling/opr_impl.cpp b/dnn/src/cuda/pooling/opr_impl.cpp index e5efb5e2..88183a0e 100644 --- a/dnn/src/cuda/pooling/opr_impl.cpp +++ b/dnn/src/cuda/pooling/opr_impl.cpp @@ -21,8 +21,7 @@ namespace cuda { size_t PoolingForwardImpl::get_workspace_in_bytes(const TensorLayout& src, const TensorLayout& dst) { - AlgoBase::SizeArgs args(this, src, dst); - return get_algorithm(this, src, dst)->get_workspace_in_bytes(args); + return get_dnn_workspace(this, src, dst); } const char* PoolingForwardImpl::get_algorithm_set_name() const { @@ -117,9 +116,7 @@ size_t PoolingBackwardImpl::get_workspace_in_bytes(const TensorLayout& src, const TensorLayout& dst, const TensorLayout& diff, const TensorLayout& grad) { - AlgoBase::SizeArgs args(this, src, dst, diff, grad); - return get_algorithm(this, src, dst, diff, grad) - ->get_workspace_in_bytes(args); + return get_dnn_workspace(this, src, dst, diff, grad); } } // namespace cuda diff --git 
a/dnn/src/fallback/batched_matrix_mul/opr_impl.cpp b/dnn/src/fallback/batched_matrix_mul/opr_impl.cpp index 758a6da8..ff32abe3 100644 --- a/dnn/src/fallback/batched_matrix_mul/opr_impl.cpp +++ b/dnn/src/fallback/batched_matrix_mul/opr_impl.cpp @@ -44,8 +44,7 @@ BatchedMatrixMulForwardImpl::get_algorithm_heuristic( size_t BatchedMatrixMulForwardImpl::get_workspace_in_bytes( const TensorLayout& A, const TensorLayout& B, const TensorLayout& C) { - AlgoBase::SizeArgs args{this, A, B, C}; - return megdnn::get_algorithm(this, A, B, C)->get_workspace_in_bytes(args); + return get_dnn_workspace(this, A, B, C); } void BatchedMatrixMulForwardImpl::exec(_megdnn_tensor_in A, _megdnn_tensor_in B, @@ -54,7 +53,7 @@ void BatchedMatrixMulForwardImpl::exec(_megdnn_tensor_in A, _megdnn_tensor_in B, check_exec(A.layout, B.layout, C.layout, workspace.size); AlgoBase::ExecArgs args(this, A, B, C, workspace); auto&& algo = get_algorithm(this, A.layout, B.layout, C.layout); - algo->check_workspace(args, workspace).exec(args); + algo->exec(args); } // vim: syntax=cpp.doxygen diff --git a/dnn/src/fallback/conv_bias/opr_impl.cpp b/dnn/src/fallback/conv_bias/opr_impl.cpp index dd80a2d5..fd41e8bf 100644 --- a/dnn/src/fallback/conv_bias/opr_impl.cpp +++ b/dnn/src/fallback/conv_bias/opr_impl.cpp @@ -224,6 +224,15 @@ size_t ConvBiasImpl::get_workspace_in_bytes( const TensorLayout& bias, const TensorLayout& z, const TensorLayout& dst, const PreprocessedFilter* preprocessed_filter) { + TensorLayoutArray layouts{src, filter, bias, z, dst}; + HeuristicCache::Key key{this->handle(), this->get_opr_type(), + layouts.data(), layouts.size(), &this->param(), + sizeof(this->param())}; + auto rst = HeuristicCache::instance().get(key); + if (rst.policy.algo.valid()) { + return rst.workspace; + } + auto fparam = make_ncb_kern_size_param(src, filter, bias, dst, preprocessed_filter); auto&& algo = get_algorithm(fparam); diff --git a/dnn/src/fallback/convolution/opr_impl.cpp b/dnn/src/fallback/convolution/opr_impl.cpp index eb05cb14..9551e3ea 100644 --- a/dnn/src/fallback/convolution/opr_impl.cpp +++ b/dnn/src/fallback/convolution/opr_impl.cpp @@ -146,6 +146,15 @@ size_t ConvolutionImpl::get_workspace_in_bytes( const TensorLayout& src, const TensorLayout& filter, const TensorLayout& dst, const PreprocessedFilter* preprocessed_filter) { + TensorLayoutArray layouts{src, filter, dst}; + HeuristicCache::Key key{this->handle(), this->get_opr_type(), + layouts.data(), layouts.size(), &this->param(), + sizeof(this->param())}; + auto rst = HeuristicCache::instance().get(key); + if (rst.policy.algo.valid()) { + return rst.workspace; + } + auto fparam = make_ncb_kern_size_param(src, filter, dst, preprocessed_filter); auto&& algo = get_algorithm(fparam); @@ -494,6 +503,15 @@ void ConvolutionBackwardDataImpl::exec(_megdnn_tensor_in filter, size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes( const TensorLayout& filter, const TensorLayout& diff, const TensorLayout& grad) { + TensorLayoutArray layouts{filter, diff, grad}; + HeuristicCache::Key key{this->handle(), this->get_opr_type(), + layouts.data(), layouts.size(), &this->param(), + sizeof(this->param())}; + auto rst = HeuristicCache::instance().get(key); + if (rst.policy.algo.valid()) { + return rst.workspace; + } + if (param().format == param::Convolution::Format::NHWCD4 || param().format == param::Convolution::Format::NCHW4 || (param().format == param::Convolution::Format::NCHW && diff --git a/dnn/src/fallback/matrix_mul/opr_impl.cpp b/dnn/src/fallback/matrix_mul/opr_impl.cpp index 
3b14d1ed..82ef1591 100644 --- a/dnn/src/fallback/matrix_mul/opr_impl.cpp +++ b/dnn/src/fallback/matrix_mul/opr_impl.cpp @@ -219,6 +219,15 @@ MatrixMulImpl::KernParam MatrixMulImpl::make_kern_param( size_t MatrixMulImpl::get_workspace_in_bytes(const TensorLayout& A, const TensorLayout& B, const TensorLayout& C) { + TensorLayoutArray layouts{A, B, C}; + HeuristicCache::Key key{this->handle(),this->get_opr_type(), + layouts.data(), layouts.size(), &this->param(), + sizeof(this->param())}; + auto rst = HeuristicCache::instance().get(key); + if (rst.policy.algo.valid()) { + return rst.workspace; + } + if (auto algo = get_algorithm_heuristic( A, B, C, std::numeric_limits::max(), AlgoAttribute::DEFAULT, AlgoAttribute::DEFAULT)) { diff --git a/dnn/src/naive/batch_conv_bias/opr_impl.cpp b/dnn/src/naive/batch_conv_bias/opr_impl.cpp index a2479f32..26d1bdc8 100644 --- a/dnn/src/naive/batch_conv_bias/opr_impl.cpp +++ b/dnn/src/naive/batch_conv_bias/opr_impl.cpp @@ -15,6 +15,7 @@ #include "src/naive/convolution/helper.h" #include +#include "megdnn/heuristic_cache.h" #include "src/common/utils.h" #include "src/naive/handle.h" @@ -56,6 +57,14 @@ size_t BatchConvBiasForwardImpl::get_workspace_in_bytes( const TensorLayout& src, const TensorLayout& flt, const TensorLayout& bias, const TensorLayout& z, const TensorLayout& dst) { + TensorLayoutArray layouts{src, flt, bias, z, dst}; + HeuristicCache::Key key{this->handle(), this->get_opr_type(), + layouts.data(), layouts.size(), &this->param(), + sizeof(this->param())}; + auto rst = HeuristicCache::instance().get(key); + if (rst.policy.algo.valid()) { + return rst.workspace; + } return get_workspace_bundle(nullptr, src, flt, bias, z, dst) .total_size_in_bytes(); } diff --git a/dnn/src/naive/conv_bias/opr_impl.cpp b/dnn/src/naive/conv_bias/opr_impl.cpp index 649b5dca..8c31744a 100644 --- a/dnn/src/naive/conv_bias/opr_impl.cpp +++ b/dnn/src/naive/conv_bias/opr_impl.cpp @@ -13,6 +13,7 @@ #include "src/naive/convolution/helper.h" #include +#include "megdnn/heuristic_cache.h" #include "megdnn/dtype.h" #include "src/common/conv_bias.h" #include "src/common/opr_delegate.h" @@ -201,6 +202,15 @@ size_t ConvBiasForwardImpl::get_workspace_in_bytes(const TensorLayout& src, const TensorLayout& z, const TensorLayout& dst, const PreprocessedFilter*) { + TensorLayoutArray layouts{src, flt, bias, z, dst}; + HeuristicCache::Key key{this->handle(), this->get_opr_type(), + layouts.data(), layouts.size(), &this->param(), + sizeof(this->param())}; + auto rst = HeuristicCache::instance().get(key); + if (rst.policy.algo.valid()) { + return rst.workspace; + } + size_t float_workspace_size = 0; if (z.ndim > 0 && z.dtype.category() != DTypeCategory::FLOAT) { diff --git a/dnn/src/naive/convolution/convolution.cpp b/dnn/src/naive/convolution/convolution.cpp index 834a7ca5..658e908e 100644 --- a/dnn/src/naive/convolution/convolution.cpp +++ b/dnn/src/naive/convolution/convolution.cpp @@ -11,7 +11,7 @@ #include "./opr_impl.h" #include "./helper.h" -#include "src/naive/handle.h" +#include "megdnn/heuristic_cache.h" #include "src/naive/handle.h" #include "src/common/utils.h" #include "megdnn/dtype.h" @@ -78,6 +78,15 @@ void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes(const TensorLayout& filter, const TensorLayout& diff, const TensorLayout& grad) { + TensorLayoutArray layouts{filter, diff, grad}; + HeuristicCache::Key key{this->handle(), this->get_opr_type(), + layouts.data(), layouts.size(), &this->param(), + 
sizeof(this->param())}; + auto rst = HeuristicCache::instance().get(key); + if (rst.policy.algo.valid()) { + return rst.workspace; + } + size_t workspace_size = 0; auto flt_dt = filter.dtype.enumv(); auto grad_dt = grad.dtype.enumv(); @@ -191,6 +200,15 @@ size_t ConvolutionBackwardFilterImpl::get_workspace_in_bytes( const TensorLayout& grad) { size_t workspace_size = 0; #if !MEGDNN_DISABLE_FLOAT16 + TensorLayoutArray layouts{src, diff, grad}; + HeuristicCache::Key key{this->handle(), this->get_opr_type(), + layouts.data(), layouts.size(), &this->param(), + sizeof(this->param())}; + auto rst = HeuristicCache::instance().get(key); + if (rst.policy.algo.valid()) { + return rst.workspace; + } + auto src_dt = src.dtype.enumv(); auto grad_dt = grad.dtype.enumv(); auto diff_dt = diff.dtype.enumv(); diff --git a/dnn/src/naive/pooling/opr_impl.cpp b/dnn/src/naive/pooling/opr_impl.cpp index 5242d4f9..2c8dd35d 100644 --- a/dnn/src/naive/pooling/opr_impl.cpp +++ b/dnn/src/naive/pooling/opr_impl.cpp @@ -12,6 +12,7 @@ #include "src/naive/pooling/opr_impl.h" #include +#include "megdnn/heuristic_cache.h" #include "megdnn/dtype.h" #include "src/common/utils.h" #include "src/naive/handle.h" @@ -402,6 +403,14 @@ WorkspaceBundle PoolingForwardImpl::get_workspace_bundle( size_t PoolingForwardImpl::get_workspace_in_bytes(const TensorLayout& src, const TensorLayout& dst) { + TensorLayoutArray layouts{src, dst}; + HeuristicCache::Key key{this->handle(), this->get_opr_type(), + layouts.data(), layouts.size(), &this->param(), + sizeof(this->param())}; + auto rst = HeuristicCache::instance().get(key); + if (rst.policy.algo.valid()) { + return rst.workspace; + } return get_workspace_bundle(nullptr, src, dst).total_size_in_bytes(); } namespace { @@ -652,6 +661,14 @@ WorkspaceBundle PoolingBackwardImpl::get_workspace_bundle( size_t PoolingBackwardImpl::get_workspace_in_bytes( const TensorLayout& src, const TensorLayout& dst, const TensorLayout& diff, const TensorLayout& grad) { + TensorLayoutArray layouts{src, dst, diff, grad}; + HeuristicCache::Key key{this->handle(), this->get_opr_type(), + layouts.data(), layouts.size(), &this->param(), + sizeof(this->param())}; + auto rst = HeuristicCache::instance().get(key); + if (rst.policy.algo.valid()) { + return rst.workspace; + } return get_workspace_bundle(nullptr, src, dst, diff, grad) .total_size_in_bytes(); } diff --git a/dnn/src/rocm/batched_matrix_mul/opr_impl.cpp b/dnn/src/rocm/batched_matrix_mul/opr_impl.cpp index b67ba8ed..b1a7c16c 100644 --- a/dnn/src/rocm/batched_matrix_mul/opr_impl.cpp +++ b/dnn/src/rocm/batched_matrix_mul/opr_impl.cpp @@ -47,8 +47,7 @@ BatchedMatrixMulForwardImpl::get_algorithm_heuristic( size_t BatchedMatrixMulForwardImpl::get_workspace_in_bytes( const TensorLayout& A, const TensorLayout& B, const TensorLayout& C) { - AlgoBase::SizeArgs args{this, A, B, C}; - return megdnn::get_algorithm(this, A, B, C)->get_workspace_in_bytes(args); + return get_dnn_workspace(this, A, B, C); } void BatchedMatrixMulForwardImpl::exec(_megdnn_tensor_in A, _megdnn_tensor_in B, @@ -57,7 +56,7 @@ void BatchedMatrixMulForwardImpl::exec(_megdnn_tensor_in A, _megdnn_tensor_in B, check_exec(A.layout, B.layout, C.layout, workspace.size); AlgoBase::ExecArgs args(this, A, B, C, workspace); auto&& algo = get_algorithm(this, A.layout, B.layout, C.layout); - algo->check_workspace(args, workspace).exec(args); + algo->exec(args); } // vim: syntax=cpp.doxygen diff --git a/dnn/src/rocm/convolution/opr_impl.cpp b/dnn/src/rocm/convolution/opr_impl.cpp index a758019d..b7d79039 100644 
--- a/dnn/src/rocm/convolution/opr_impl.cpp +++ b/dnn/src/rocm/convolution/opr_impl.cpp @@ -112,19 +112,30 @@ ConvolutionForwardImpl::get_all_algorithms(const TensorLayout& src, size_t ConvolutionForwardImpl::get_workspace_in_bytes( const TensorLayout& src, const TensorLayout& filter, const TensorLayout& dst, const PreprocessedFilter*) { + TensorLayoutArray layouts{src, filter, dst}; + HeuristicCache::Key key{this->handle(), this->get_opr_type(), + layouts.data(), layouts.size(), &this->param(), + sizeof(this->param())}; + auto rst = HeuristicCache::instance().get(key); + if (rst.policy.algo.valid()) { + return rst.workspace; + } + AlgoBase::SizeArgs args(this, src, filter, dst); - return get_algorithm(this, src, args.filter_meta, dst) + return get_algorithm(this, src, filter, dst) ->get_workspace_in_bytes(args); } void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, _megdnn_tensor_out dst, - const PreprocessedFilter*, + const PreprocessedFilter* preprocessed_filter, _megdnn_workspace workspace) { + check_exec(src.layout, filter.layout, dst.layout, workspace.size, + preprocessed_filter); AlgoBase::ExecArgs args(this, src, filter, dst, workspace); - auto algo = get_algorithm(this, src.layout, args.filter_meta, dst.layout); - algo->check_workspace(args, workspace).exec(args); + auto algo = get_algorithm(this, src.layout, filter.layout, dst.layout); + algo->exec(args); } const char* ConvolutionForwardImpl::get_algorithm_set_name() const { @@ -137,9 +148,10 @@ void ConvolutionBackwardDataImpl::exec(_megdnn_tensor_in filter, _megdnn_tensor_in diff, _megdnn_tensor_out grad, _megdnn_workspace workspace) { + check_exec(filter.layout, diff.layout, grad.layout, workspace.size); AlgoBase::ExecArgs args(this, filter, diff, grad, workspace); - auto algo = get_algorithm(this, args.filter_meta, diff.layout, grad.layout); - algo->check_workspace(args, workspace).exec(args); + auto algo = get_algorithm(this, filter.layout, diff.layout, grad.layout); + algo->exec(args); } std::vector @@ -192,8 +204,17 @@ ConvolutionBackwardDataImpl::get_algorithm_heuristic( size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes( const TensorLayout& filter, const TensorLayout& diff, const TensorLayout& grad) { + TensorLayoutArray layouts{filter, diff, grad}; + HeuristicCache::Key key{this->handle(), this->get_opr_type(), + layouts.data(), layouts.size(), &this->param(), + sizeof(this->param())}; + auto rst = HeuristicCache::instance().get(key); + if (rst.policy.algo.valid()) { + return rst.workspace; + } + AlgoBase::SizeArgs args(this, filter, diff, grad); - return get_algorithm(this, args.filter_meta, diff, grad) + return get_algorithm(this, filter, diff, grad) ->get_workspace_in_bytes(args); } @@ -207,10 +228,11 @@ void ConvolutionBackwardFilterImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in diff, _megdnn_tensor_out grad, _megdnn_workspace workspace) { + check_exec(src.layout, diff.layout, grad.layout, workspace.size); AlgoBase::ExecArgs args(this, src, diff, grad, workspace); auto algo = - get_algorithm(this, src.layout, diff.layout, args.grad_filter_meta); - algo->check_workspace(args, workspace).exec(args); + get_algorithm(this, src.layout, diff.layout, grad.layout); + algo->exec(args); } std::vector @@ -264,8 +286,17 @@ ConvolutionBackwardFilterImpl::get_algorithm_heuristic( size_t ConvolutionBackwardFilterImpl::get_workspace_in_bytes( const TensorLayout& src, const TensorLayout& diff, const TensorLayout& grad) { + TensorLayoutArray layouts{src, diff, grad}; + HeuristicCache::Key 
key{this->handle(), this->get_opr_type(), + layouts.data(), layouts.size(), &this->param(), + sizeof(this->param())}; + auto rst = HeuristicCache::instance().get(key); + if (rst.policy.algo.valid()) { + return rst.workspace; + } + AlgoBase::SizeArgs args(this, src, diff, grad); - return get_algorithm(this, src, diff, args.grad_filter_meta) + return get_algorithm(this, src, diff, grad) ->get_workspace_in_bytes(args); } diff --git a/dnn/src/rocm/convolution/opr_impl.h b/dnn/src/rocm/convolution/opr_impl.h index 9bcd94e4..8aaca2c8 100644 --- a/dnn/src/rocm/convolution/opr_impl.h +++ b/dnn/src/rocm/convolution/opr_impl.h @@ -24,7 +24,7 @@ public: const PreprocessedFilter* preprocessed_filter, _megdnn_workspace workspace) override; AlgorithmInfo get_algorithm_info_heuristic( - const TensorLayout& src, const CanonizedFilterMeta& filter, + const TensorLayout& src, const TensorLayout& filter, const TensorLayout& dst, size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, const AlgoAttribute& negative_attr) { @@ -95,7 +95,7 @@ public: void exec(_megdnn_tensor_in filter, _megdnn_tensor_in diff, _megdnn_tensor_out grad, _megdnn_workspace workspace) override; AlgorithmInfo get_algorithm_info_heuristic( - const CanonizedFilterMeta& filter, const TensorLayout& diff, + const TensorLayout& filter, const TensorLayout& diff, const TensorLayout& grad, size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, const AlgoAttribute& negative_attr) { @@ -145,7 +145,7 @@ public: _megdnn_tensor_out grad, _megdnn_workspace workspace) override; AlgorithmInfo get_algorithm_info_heuristic( const TensorLayout& src, const TensorLayout& diff, - const CanonizedFilterMeta& grad, size_t workspace_limit_in_bytes, + const TensorLayout& grad, size_t workspace_limit_in_bytes, const AlgoAttribute& positive_attr, const AlgoAttribute& negative_attr) { return get_algorithm_heuristic(src, diff, grad, diff --git a/dnn/src/rocm/matrix_mul/opr_impl.cpp b/dnn/src/rocm/matrix_mul/opr_impl.cpp index b233a96f..e423d538 100644 --- a/dnn/src/rocm/matrix_mul/opr_impl.cpp +++ b/dnn/src/rocm/matrix_mul/opr_impl.cpp @@ -44,8 +44,7 @@ MatrixMulForwardImpl::Algorithm* MatrixMulForwardImpl::get_algorithm_heuristic( size_t MatrixMulForwardImpl::get_workspace_in_bytes(const TensorLayout& A, const TensorLayout& B, const TensorLayout& C) { - AlgoBase::SizeArgs args{this, A, B, C}; - return megdnn::get_algorithm(this, A, B, C)->get_workspace_in_bytes(args); + return get_dnn_workspace(this, A, B, C); } void MatrixMulForwardImpl::exec(_megdnn_tensor_in A, _megdnn_tensor_in B, @@ -54,7 +53,7 @@ void MatrixMulForwardImpl::exec(_megdnn_tensor_in A, _megdnn_tensor_in B, check_exec(A.layout, B.layout, C.layout, workspace.size); AlgoBase::ExecArgs args(this, A, B, C, workspace); auto&& algo = get_algorithm(this, A.layout, B.layout, C.layout); - algo->check_workspace(args, workspace).exec(args); + algo->exec(args); } // vim: syntax=cpp.doxygen diff --git a/dnn/src/rocm/pooling/opr_impl.cpp b/dnn/src/rocm/pooling/opr_impl.cpp index ae49eba9..074a1dbc 100644 --- a/dnn/src/rocm/pooling/opr_impl.cpp +++ b/dnn/src/rocm/pooling/opr_impl.cpp @@ -19,8 +19,7 @@ namespace rocm { size_t PoolingForwardImpl::get_workspace_in_bytes(const TensorLayout& src, const TensorLayout& dst) { - AlgoBase::SizeArgs args(this, src, dst); - return get_algorithm(this, src, dst)->get_workspace_in_bytes(args); + return get_dnn_workspace(this, src, dst); } const char* PoolingForwardImpl::get_algorithm_set_name() const { @@ -69,9 +68,7 @@ size_t 
PoolingBackwardImpl::get_workspace_in_bytes(const TensorLayout& src, const TensorLayout& dst, const TensorLayout& diff, const TensorLayout& grad) { - AlgoBase::SizeArgs args(this, src, dst, diff, grad); - return get_algorithm(this, src, dst, diff, grad) - ->get_workspace_in_bytes(args); + return get_dnn_workspace(this, src, dst, diff, grad); }; const char* PoolingBackwardImpl::get_algorithm_set_name() const { diff --git a/dnn/src/x86/pooling/opr_impl.cpp b/dnn/src/x86/pooling/opr_impl.cpp index c6601d2b..6178999b 100644 --- a/dnn/src/x86/pooling/opr_impl.cpp +++ b/dnn/src/x86/pooling/opr_impl.cpp @@ -46,6 +46,15 @@ WorkspaceBundle megdnn::x86::get_bundle(const TensorLayout& src, size_t PoolingImpl::get_workspace_in_bytes(const TensorLayout& src, const TensorLayout& dst) { + TensorLayoutArray layouts{src, dst}; + HeuristicCache::Key key{this->handle(), this->get_opr_type(), + layouts.data(), layouts.size(), &this->param(), + sizeof(this->param())}; + auto rst = HeuristicCache::instance().get(key); + if (rst.policy.algo.valid()) { + return rst.workspace; + } + auto algo = get_algorithm(this, src, dst); if (!is_fallback_algo(algo)) { if (is_supported(SIMDType::SSE) && src.dtype == dtype::Float32() && diff --git a/src/core/test/graph/misc.cpp b/src/core/test/graph/misc.cpp index d7d724b1..e0e0810f 100644 --- a/src/core/test/graph/misc.cpp +++ b/src/core/test/graph/misc.cpp @@ -29,6 +29,7 @@ #include "megbrain/plugin/profiler.h" #include "megbrain/test/helper.h" +#include "megdnn/heuristic_cache.h" #include "megdnn/oprs/base.h" #include @@ -2075,10 +2076,12 @@ void test_free_memory_in_weight_preprocess(int record_level, CompNode cn) { TEST(TestGraph, FreeMemoryInWeightPreprocess) { test_free_memory_in_weight_preprocess(0, CompNode::load("xpu0")); + megdnn::HeuristicCache::instance().clear(); } TEST(TestGraph, RecordFreeMemoryInWeightPreprocess) { test_free_memory_in_weight_preprocess(1, CompNode::load("cpu0")); + megdnn::HeuristicCache::instance().clear(); } namespace { @@ -2157,6 +2160,7 @@ TEST(TestGraph, FreeMemoryInWeightPreprocessWithValueInfer) { ->cast_final_safe() .get_dev_tensor() .empty()); + megdnn::HeuristicCache::instance().clear(); } TEST(TestGraph, FreeMemoryInWeightPreprocessWithMultiReader) { @@ -2200,6 +2204,7 @@ TEST(TestGraph, FreeMemoryInWeightPreprocessWithMultiReader) { ->cast_final_safe() .get_dev_tensor() .empty()); + megdnn::HeuristicCache::instance().clear(); } TEST(TestGraph, FreeBias) { diff --git a/src/opr/impl/search_policy/algo_chooser.cpp b/src/opr/impl/search_policy/algo_chooser.cpp index 4be461f7..ffefee25 100644 --- a/src/opr/impl/search_policy/algo_chooser.cpp +++ b/src/opr/impl/search_policy/algo_chooser.cpp @@ -24,6 +24,7 @@ //! TODO: here has to be know some megdnn::opr when there is produced midout.h //! fix it if there is another graceful way. 
+#include "megdnn/heuristic_cache.h" #include "megdnn/opr_param_defs.h" #include "megdnn/oprs.h" #include "megdnn/oprs/base.h" @@ -1156,6 +1157,15 @@ template size_t AlgoChooser::setup_algo(const FixedTensorLayouts& layouts, Opr* megdnn_opr, const MGBOpr* mgb_opr, bool allow_weight_preprocess) { + HeuristicCache::Key cache_key( + megdnn_opr->handle(), megdnn_opr->get_opr_type(), layouts.data(), + layouts.size(), &megdnn_opr->param(), sizeof(megdnn_opr->param())); + auto rst = HeuristicCache::instance().get(cache_key); + if (rst.policy.algo.valid()) { + megdnn_opr->execution_policy() = rst.policy; + return rst.workspace; + } + if (WorkspaceLimitGetter::is_prealloc_run(mgb_opr->owner_graph())) { return 0; } @@ -1192,6 +1202,11 @@ size_t AlgoChooser::setup_algo(const FixedTensorLayouts& layouts, mgb_log_debug("%s", ret.c_str()); megdnn_opr->execution_policy() = policy; + + if (mgb_opr->execution_policy().strategy & ExecutionStrategy::HEURISTIC) { + HeuristicCache::Result cache_result{policy, workspace}; + HeuristicCache::instance().put(cache_key, cache_result); + } return workspace; } diff --git a/src/opr/test/algo_chooser.cpp b/src/opr/test/algo_chooser.cpp index 967d631d..2a31616d 100644 --- a/src/opr/test/algo_chooser.cpp +++ b/src/opr/test/algo_chooser.cpp @@ -22,6 +22,7 @@ #include "megbrain/opr/tensor_manip.h" #include "megdnn/oprs/base.h" #include "megdnn/dtype.h" +#include "megdnn/heuristic_cache.h" #include #include @@ -337,6 +338,7 @@ void test_no_profiling_on_shape_change(const TensorShapeArray& inps0, TEST(TestOprDNN, FastrunNoProfilingOnShapeChange) { REQUIRE_GPU(1); + megdnn::HeuristicCache::instance().clear(); test_no_profiling_on_shape_change( {{12, 3, 36, 36}, {4, 3, 3, 3}}, {{32, 3, 28, 28}, {4, 3, 3, 3}}); diff --git a/src/opr/test/dnn/convolution.cpp b/src/opr/test/dnn/convolution.cpp index 88645a48..1b7c8ac3 100644 --- a/src/opr/test/dnn/convolution.cpp +++ b/src/opr/test/dnn/convolution.cpp @@ -21,6 +21,7 @@ #include "megbrain/gopt/inference.h" #include "megbrain/opr/tensor_manip.h" #include "megdnn/dtype.h" +#include "megdnn/heuristic_cache.h" #include "megdnn/oprs/base.h" #include @@ -396,6 +397,7 @@ TEST(TestOprDNN, ConvBiasExePolicy) { #endif run(strategy); } + megdnn::HeuristicCache::instance().clear(); ASSERT_THROW(run(S::OPTIMIZED | S::PROFILE), MegBrainError); PersistentCache::set_impl(orig_impl); } @@ -460,6 +462,7 @@ TEST(TestOprDNN, ConvolutionExePolicy) { for (auto strategy : SmallVector{S : HEURISTIC, S::PROFILE | S::HEURISTIC}) { #endif + megdnn::HeuristicCache::instance().clear(); using Checker = AutoOprChecker<2, 1>; auto make_graph = [&](const Checker::SymInpArray& inputs) @@ -489,6 +492,7 @@ TEST(TestOprDNN, ConvolutionExePolicy) { } else { ASSERT_LT(0, nr_get); } + megdnn::HeuristicCache::instance().clear(); } } @@ -544,6 +548,7 @@ TEST(TestOprDNN, ConvolutionBackwardDataBfloat16ExePolicy) { #else for (auto strategy: {S:HEURISTIC, S(S::PROFILE | S::HEURISTIC)}) { #endif + megdnn::HeuristicCache::instance().clear(); using Checker = AutoOprChecker<2, 1>; auto make_graph = [&](const Checker::SymInpArray& inputs) @@ -1835,6 +1840,7 @@ TEST(TestOprDNN, LocalShareForwardExecPolicy) { auto run_with_param = [&](size_t fh = 3, size_t fw = 3, size_t sh = 1, size_t sw = 1, size_t sgh = 3, size_t sgw = 3) { + megdnn::HeuristicCache::instance().clear(); size_t ph = fh / 2, pw = fw / 2; param.pad_h = ph, param.pad_w = pw; param.stride_h = sh, param.stride_w = sw, @@ -2289,6 +2295,7 @@ TEST(TestOprDNN, HeuristicReproducible) { } algo_name0 = palgo->name(); } + 
megdnn::HeuristicCache::instance().clear(); { Checker checker(make_graph, fwd); checker.run(inp_tensor(2, 3, 4, 9, 8, 3, 3), opt) @@ -2306,6 +2313,7 @@ TEST(TestOprDNN, HeuristicReproducible) { algo_name1 = palgo->name(); } EXPECT_TRUE(algo_name0 == algo_name1); + megdnn::HeuristicCache::instance().clear(); } #undef inp_tensor #undef get_shp @@ -2585,6 +2593,7 @@ TEST_F(TestWeightPreprocess, NoPreprocessNeeded) { } TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) { + megdnn::HeuristicCache::instance().clear(); using ::testing::_; using ::testing::Return; using ::testing::Field;