@@ -0,0 +1,92 @@ | |||
/** | |||
* \file dnn/include/megdnn/heuristic_cache.h | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#pragma once | |||
#include "megdnn/basic_types.h" | |||
#include "megdnn/oprs/base.h" | |||
#include <mutex> | |||
#include <string> | |||
#include <unordered_map> | |||
namespace megdnn { | |||
class HeuristicCache { | |||
private: | |||
HeuristicCache() = default; | |||
public: | |||
static HeuristicCache& instance(); | |||
struct KeyStorage { | |||
std::string category; | |||
std::string input; | |||
bool operator==(const KeyStorage& k) const { | |||
return category == k.category && input == k.input; | |||
} | |||
}; | |||
class Key { | |||
Handle* m_handle; | |||
uint32_t m_opr_type; | |||
const TensorLayout* m_inp_layouts_ptr; | |||
size_t m_inp_layouts_size; | |||
const void* m_param_ptr; | |||
size_t m_param_size; | |||
mutable std::string m_category; | |||
mutable std::string m_input; | |||
public: | |||
Key(Handle* opr_handle, Algorithm::OprType opr_type, const TensorLayout* inp_layouts_ptr, | |||
size_t inp_layouts_size, const void* param_ptr = nullptr, size_t param_size = 0) | |||
: m_handle{opr_handle}, | |||
m_opr_type{static_cast<uint32_t>(opr_type)}, | |||
m_inp_layouts_ptr{inp_layouts_ptr}, | |||
m_inp_layouts_size{inp_layouts_size}, | |||
m_param_ptr{param_ptr}, | |||
m_param_size{param_size} {} | |||
KeyStorage build_key_storage() const; | |||
}; | |||
struct Result { | |||
ExecutionPolicy policy; | |||
size_t workspace; | |||
}; | |||
void put(const Key& key, Result& result); | |||
Result get(const Key& key); | |||
void clear(); | |||
private: | |||
struct Hash { | |||
size_t operator()(const KeyStorage& k) const { | |||
size_t h1 = std::hash<std::string>{}(k.category); | |||
size_t h2 = std::hash<std::string>{}(k.input); | |||
h1 ^= h2 + 0x9e3779b9 + (h1 << 6) + (h1 >> 2); | |||
return h1; | |||
} | |||
}; | |||
std::unordered_map<KeyStorage, Result, Hash> m_heuristic_cache; | |||
#if __DEPLOY_ON_XP_SP2__ | |||
size_t m_mtx; | |||
#else | |||
std::mutex m_mtx; | |||
#endif | |||
}; | |||
} // namespace megdnn |
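The header above only declares the cache; the rest of this patch consults it with a recurring pattern, sketched below. `MyOprImpl` and `compute_workspace_fallback()` are hypothetical placeholders for illustration only.

```cpp
// Minimal usage sketch (assumed operator names): query the HeuristicCache
// before running algorithm selection, and fall through on a miss.
size_t MyOprImpl::get_workspace_in_bytes(const TensorLayout& src,
                                         const TensorLayout& dst) {
    TensorLayoutArray layouts{src, dst};
    HeuristicCache::Key key{this->handle(), this->get_opr_type(),
                            layouts.data(), layouts.size(), &this->param(),
                            sizeof(this->param())};
    auto rst = HeuristicCache::instance().get(key);
    if (rst.policy.algo.valid()) {
        return rst.workspace;  // cache hit: reuse the previously stored result
    }
    // cache miss: fall back to the operator's regular workspace computation
    return compute_workspace_fallback(src, dst);
}
```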
@@ -42,6 +42,10 @@ public: | |||
const TensorLayout& B, | |||
const TensorLayout& C) = 0; | |||
static Algorithm::OprType get_opr_type() { | |||
return Algorithm::OprType::BATCHED_MATRIX_MUL_FORWARD; | |||
} | |||
protected: | |||
void check_exec(const TensorLayout& A, const TensorLayout& B, | |||
const TensorLayout& C, size_t workspace_in_bytes); | |||
@@ -76,6 +80,11 @@ public: | |||
const TensorLayout& C) = 0; | |||
static size_t pack_size(const Param::Format format); | |||
static Algorithm::OprType get_opr_type() { | |||
return Algorithm::OprType::MATRIX_MUL_FORWARD; | |||
} | |||
protected: | |||
void check_exec(const TensorLayout& A, const TensorLayout& B, | |||
const TensorLayout& C, size_t workspace_in_bytes); | |||
@@ -275,6 +275,10 @@ public: | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst) = 0; | |||
static Algorithm::OprType get_opr_type() { | |||
return Algorithm::OprType::CONVOLUTION_FORWARD; | |||
} | |||
protected: | |||
CanonizedFilterMeta check_exec( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
@@ -309,6 +313,10 @@ public: | |||
void deduce_layout(const TensorLayout& filter, const TensorLayout& diff, | |||
TensorLayout& grad); | |||
static Algorithm::OprType get_opr_type() { | |||
return Algorithm::OprType::CONVOLUTION_BACKWARD_DATA; | |||
} | |||
protected: | |||
CanonizedFilterMeta check_exec(const TensorLayout& filter, | |||
const TensorLayout& diff, | |||
@@ -338,6 +346,10 @@ public: | |||
const TensorLayout& diff, | |||
const TensorLayout& grad) = 0; | |||
static Algorithm::OprType get_opr_type() { | |||
return Algorithm::OprType::CONVOLUTION_BACKWARD_FILTER; | |||
} | |||
protected: | |||
CanonizedFilterMeta check_exec(const TensorLayout& src, | |||
const TensorLayout& diff, | |||
@@ -505,6 +517,10 @@ public: | |||
const ConvBiasForward::BiasMode bias_mode, | |||
const param::ConvBias::NonlineMode nonline_mode); | |||
static Algorithm::OprType get_opr_type() { | |||
return Algorithm::OprType::CONVBIAS_FORWARD; | |||
} | |||
protected: | |||
CanonizedFilterMeta check_exec( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
@@ -775,6 +791,10 @@ public: | |||
virtual size_t get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& dst) = 0; | |||
static Algorithm::OprType get_opr_type() { | |||
return Algorithm::OprType::POOLING_FORWARD; | |||
} | |||
protected: | |||
void check_exec(const TensorLayout& src, const TensorLayout& dst, | |||
size_t workspace_in_bytes); | |||
@@ -801,6 +821,10 @@ public: | |||
const TensorLayout& diff, | |||
const TensorLayout& grad) = 0; | |||
static Algorithm::OprType get_opr_type() { | |||
return Algorithm::OprType::POOLING_BACKWARD; | |||
} | |||
protected: | |||
void check_exec(const TensorLayout& src, const TensorLayout& dst, | |||
const TensorLayout& diff, const TensorLayout& grad, | |||
@@ -1216,6 +1240,10 @@ public: | |||
const TensorLayout& filter, | |||
const TensorLayout& dst) = 0; | |||
static Algorithm::OprType get_opr_type() { | |||
return Algorithm::OprType::CONVOLUTION3D_FORWARD; | |||
} | |||
protected: | |||
CanonizedFilterMeta check_exec(const TensorLayout& src, | |||
const TensorLayout& filter, | |||
@@ -1244,6 +1272,10 @@ public: | |||
void deduce_layout(const TensorLayout& filter, const TensorLayout& diff, | |||
TensorLayout& grad); | |||
static Algorithm::OprType get_opr_type() { | |||
return Algorithm::OprType::CONVOLUTION3D_BACKWARD_DATA; | |||
} | |||
protected: | |||
CanonizedFilterMeta check_exec(const TensorLayout& filter, | |||
const TensorLayout& diff, | |||
@@ -1268,6 +1300,10 @@ public: | |||
const TensorLayout& diff, | |||
const TensorLayout& grad) = 0; | |||
static Algorithm::OprType get_opr_type() { | |||
return Algorithm::OprType::CONVOLUTION3D_BACKWARD_FILTER; | |||
} | |||
protected: | |||
CanonizedFilterMeta check_exec(const TensorLayout& src, | |||
const TensorLayout& diff, | |||
@@ -1308,6 +1344,10 @@ public: | |||
const TensorLayout& filter, | |||
const TensorLayout& dst) = 0; | |||
static Algorithm::OprType get_opr_type() { | |||
return Algorithm::OprType::LOCAL_SHARE_FORWARD; | |||
} | |||
protected: | |||
void check_exec(const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst, size_t workspace_in_bytes); | |||
@@ -1334,6 +1374,10 @@ public: | |||
void deduce_layout(const TensorLayout& filter, const TensorLayout& diff, | |||
TensorLayout& grad); | |||
static Algorithm::OprType get_opr_type() { | |||
return Algorithm::OprType::LOCAL_SHARE_BACKWARD_DATA; | |||
} | |||
protected: | |||
void check_exec(const TensorLayout& filter, const TensorLayout& diff, | |||
const TensorLayout& grad, size_t workspace_in_bytes); | |||
@@ -1358,6 +1402,10 @@ public: | |||
const TensorLayout& diff, | |||
const TensorLayout& grad) = 0; | |||
static Algorithm::OprType get_opr_type() { | |||
return Algorithm::OprType::LOCAL_SHARE_BACKWARD_FILTER; | |||
} | |||
protected: | |||
void check_exec(const TensorLayout& src, const TensorLayout& diff, | |||
const TensorLayout& grad, size_t workspace_in_bytes); | |||
@@ -1479,6 +1527,10 @@ public: | |||
const TensorLayout& mask, | |||
const TensorLayout& dst) = 0; | |||
static Algorithm::OprType get_opr_type() { | |||
return Algorithm::OprType::DEFORMABLE_CONV_FORWARD; | |||
} | |||
protected: | |||
CanonizedFilterMeta check_exec(const TensorLayout& im, | |||
const TensorLayout& filter, | |||
@@ -1520,6 +1572,10 @@ public: | |||
const TensorLayout& mask, const TensorLayout& out_grad, | |||
TensorLayout& filter_grad); | |||
static Algorithm::OprType get_opr_type() { | |||
return Algorithm::OprType::DEFORMABLE_CONV_BACKWARD_FILTER; | |||
} | |||
protected: | |||
CanonizedFilterMeta check_exec(const TensorLayout& im, | |||
const TensorLayout& offset, | |||
@@ -1566,6 +1622,10 @@ public: | |||
const TensorLayout& out_grad, TensorLayout& im_grad, | |||
TensorLayout& offset_grad, TensorLayout& mask_grad); | |||
static Algorithm::OprType get_opr_type() { | |||
return Algorithm::OprType::DEFORMABLE_CONV_BACKWARD_DATA; | |||
} | |||
protected: | |||
CanonizedFilterMeta check_exec( | |||
const TensorLayout& im, const TensorLayout& filter, | |||
@@ -1677,6 +1737,10 @@ public: | |||
const TensorLayout& z, | |||
const TensorLayout& dst) = 0; | |||
static Algorithm::OprType get_opr_type() { | |||
return Algorithm::OprType::BATCH_CONV_FORWARD; | |||
} | |||
protected: | |||
CanonizedFilterMeta check_exec(const TensorLayout& src, | |||
const TensorLayout& filter, | |||
@@ -101,6 +101,15 @@ PoolingImpl::PoolingKernParam PoolingImpl::make_pooling_kern_param( | |||
size_t PoolingImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& dst) { | |||
TensorLayoutArray layouts{src, dst}; | |||
HeuristicCache::Key key{this->handle(), this->get_opr_type(), | |||
layouts.data(), layouts.size(), &this->param(), | |||
sizeof(this->param())}; | |||
auto rst = HeuristicCache::instance().get(key); | |||
if (rst.policy.algo.valid()) { | |||
return rst.workspace; | |||
} | |||
auto param = make_pooling_kern_szie_param(this, src, dst); | |||
auto algo = get_algorithm(this, src, dst); | |||
if (!is_fallback_algo(algo)) { | |||
@@ -6,7 +6,8 @@ | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#pragma once | |||
@@ -17,10 +18,28 @@ | |||
#include <vector> | |||
#include "megdnn/common.h" | |||
#include "megdnn/heuristic_cache.h" | |||
#include "utils.h" | |||
namespace megdnn { | |||
template <class Opr, typename... Args> | |||
size_t get_dnn_workspace(Opr* opr, Args&&... args) { | |||
TensorLayoutArray layouts{{args...}}; | |||
HeuristicCache::Key key{opr->handle(), opr->get_opr_type(), | |||
layouts.data(), layouts.size(), &opr->param(), | |||
sizeof(opr->param())}; | |||
auto rst = HeuristicCache::instance().get(key); | |||
if (rst.policy.algo.valid()) { | |||
return rst.workspace; | |||
} | |||
typename Opr::AlgoBase::SizeArgs size_args(opr, | |||
std::forward<Args>(args)...); | |||
return get_algorithm(opr, std::forward<Args>(args)...) | |||
->get_workspace_in_bytes(size_args); | |||
} | |||
/*! | |||
* \brief get user-configured algorithm, or heuristic algorithm | |||
*/ | |||
@@ -31,9 +50,20 @@ typename Opr::AlgoBase* get_algorithm(Opr* opr, Args&&... args) { | |||
if (set.valid()) { | |||
ret = set; | |||
} else { | |||
ret = opr->get_algorithm_info_heuristic( | |||
std::forward<Args>(args)..., std::numeric_limits<size_t>::max(), | |||
AlgoAttribute::DEFAULT, AlgoAttribute::DEFAULT).desc; | |||
TensorLayoutArray layouts{{args...}}; | |||
HeuristicCache::Key key{opr->handle(), opr->get_opr_type(), | |||
layouts.data(), layouts.size(), &opr->param(), | |||
sizeof(opr->param())}; | |||
auto rst = HeuristicCache::instance().get(key); | |||
if (rst.policy.algo.valid()) { | |||
ret = rst.policy.algo; | |||
} else { | |||
ret = opr->get_algorithm_info_heuristic( | |||
std::forward<Args>(args)..., | |||
std::numeric_limits<size_t>::max(), | |||
AlgoAttribute::DEFAULT, AlgoAttribute::DEFAULT) | |||
.desc; | |||
} | |||
} | |||
return static_cast<typename Opr::AlgoBase*>( | |||
opr->get_algorithm_from_desc(ret)); | |||
@@ -250,13 +250,9 @@ CanonizedFilterMeta DeformableConvBackwardData::check_exec( | |||
megdnn_assert_eq_dtype(im, mask_grad); | |||
// check layout | |||
megdnn_assert(im.shape == im_grad.shape, "invalid im_grad shape: %s", | |||
megdnn_layout_msg(im_grad).c_str()); | |||
megdnn_assert(offset.shape == offset_grad.shape, | |||
"invalid offset_grad shape: %s", | |||
megdnn_layout_msg(offset_grad).c_str()); | |||
megdnn_assert(mask.shape == mask_grad.shape, "invalid mask_grad shape: %s", | |||
megdnn_layout_msg(mask_grad).c_str()); | |||
megdnn_assert_eq_shape(im, im_grad); | |||
megdnn_assert_eq_shape(offset, offset_grad); | |||
megdnn_assert_eq_shape(mask, mask_grad); | |||
auto ret = make_canonized_filter_meta(im.ndim, filter, offset); | |||
auto required_workspace_in_bytes = | |||
@@ -0,0 +1,142 @@ | |||
/** | |||
* \file dnn/src/common/heuristic_cache.cpp | |||
* MegEngine is Licensed under the Apache License, Version 2.0 (the "License") | |||
* | |||
* Copyright (c) 2014-2021 Megvii Inc. All rights reserved. | |||
* | |||
* Unless required by applicable law or agreed to in writing, | |||
* software distributed under the License is distributed on an | |||
* "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or | |||
* implied. | |||
*/ | |||
#include "megdnn/heuristic_cache.h" | |||
#include "src/common/utils.h" | |||
#include "src/naive/handle.h" | |||
#if MEGDNN_WITH_CUDA | |||
#include "src/cuda/utils.h" | |||
#endif | |||
#if MEGDNN_WITH_ROCM | |||
#include "hcc_detail/hcc_defs_prologue.h" | |||
#include "megcore_rocm.h" | |||
#include "src/rocm/utils.h" | |||
#endif | |||
using namespace megdnn; | |||
HeuristicCache& HeuristicCache::instance() { | |||
static HeuristicCache ins; | |||
return ins; | |||
} | |||
HeuristicCache::KeyStorage HeuristicCache::Key::build_key_storage() const { | |||
auto&& ctg = m_category; | |||
auto&& inp = m_input; | |||
if (!m_category.empty() && !m_input.empty()) | |||
return {ctg, inp}; | |||
inp.reserve(sizeof(TensorLayout) * 3 * m_inp_layouts_size + m_param_size); | |||
for (size_t i = 0; i < m_inp_layouts_size; i++) { | |||
auto&& ly = m_inp_layouts_ptr[i]; | |||
for (size_t j = 0; j < ly.ndim; j++) { | |||
if (j) | |||
inp.push_back(','); | |||
inp.append(std::to_string(ly.shape[j])); | |||
} | |||
inp.push_back(';'); | |||
for (size_t j = 0; j < ly.ndim; j++) { | |||
if (j) | |||
inp.push_back(','); | |||
inp.append(std::to_string(ly.stride[j])); | |||
} | |||
inp.push_back(';'); | |||
inp.append(ly.dtype.name()); | |||
inp.push_back(';'); | |||
inp.append(ly.format.to_string().c_str()); | |||
inp.push_back('|'); | |||
} | |||
if (m_param_size) { | |||
inp.append(reinterpret_cast<const char*>(m_param_ptr), m_param_size); | |||
} | |||
ctg = "plat:"; | |||
ctg.append(std::to_string(static_cast<uint32_t>(m_handle->type()))); | |||
switch (m_handle->type()) { | |||
#if MEGDNN_WITH_CUDA | |||
case Handle::HandleType::CUDA: { | |||
int cuda_rt = -1; | |||
cuda_check(cudaRuntimeGetVersion(&cuda_rt)); | |||
cuda_rt /= 1000; | |||
auto&& handle = static_cast<megdnn::cuda::HandleImpl*>(m_handle); | |||
auto&& prop = handle->device_prop(); | |||
ctg.append(ssprintf(";dev=%s;cap=%d.%d;runtime=%d;", | |||
prop.name, prop.major, prop.minor, cuda_rt)); | |||
break; | |||
} | |||
#endif | |||
#if MEGDNN_WITH_ROCM | |||
case Handle::HandleType::ROCM: { | |||
auto&& handle = static_cast<megdnn::rocm::HandleImpl*>(m_handle); | |||
auto&& prop = handle->device_prop(); | |||
int drv = -1, hip_rt = -1; | |||
hip_check(hipDriverGetVersion(&drv)); | |||
hip_check(hipRuntimeGetVersion(&hip_rt)); | |||
ctg.append(ssprintf(";dev=%s;cap=%d.%d,drv=%d;runtime=%d;", | |||
prop.name, prop.major, prop.minor, drv, hip_rt)); | |||
break; | |||
} | |||
#endif | |||
case Handle::HandleType::FALLBACK: | |||
#if MEGDNN_X86 | |||
case Handle::HandleType::X86: | |||
#endif | |||
#if MEGDNN_AARCH64 || MEGDNN_ARMV7 | |||
case Handle::HandleType::ARM_COMMON: | |||
#endif | |||
#if MEGDNN_AARCH64 | |||
case Handle::HandleType::AARCH64: | |||
#endif | |||
#if MEGDNN_ARMV7 | |||
case Handle::HandleType::ARMV7: | |||
#endif | |||
{ | |||
size_t nr_threads = | |||
static_cast<megdnn::naive::HandleImpl*>(m_handle) | |||
->megcore_dispatcher() | |||
->nr_threads(); | |||
ctg.append(";"); | |||
ctg.append(std::to_string(nr_threads)); | |||
ctg.append(";"); | |||
break; | |||
} | |||
default: | |||
ctg.append(";"); | |||
} | |||
ctg.append(std::to_string(m_opr_type)); | |||
return {ctg, inp}; | |||
} | |||
void HeuristicCache::put(const Key& key, Result& result) { | |||
MEGDNN_LOCK_GUARD(m_mtx); | |||
if (result.policy.algo.valid()) | |||
m_heuristic_cache[key.build_key_storage()] = result; | |||
} | |||
HeuristicCache::Result HeuristicCache::get(const Key& key) { | |||
MEGDNN_LOCK_GUARD(m_mtx); | |||
KeyStorage ks = key.build_key_storage(); | |||
auto iter = m_heuristic_cache.find(ks); | |||
if (iter == m_heuristic_cache.end()) { | |||
return {}; | |||
} else { | |||
return iter->second; | |||
} | |||
} | |||
void HeuristicCache::clear() { | |||
MEGDNN_LOCK_GUARD(m_mtx); | |||
m_heuristic_cache.clear(); | |||
} |
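As implemented above, build_key_storage() serializes each layout as "shape;stride;dtype;format|" into the input string, appends the raw Param bytes, and builds the category string from "plat:" plus the handle type, platform details, and the operator type. A purely illustrative example of the resulting KeyStorage is shown below; every concrete value (handle-type code, device string, runtime version, opr-type code, format text) is an assumption, and the Param bytes are stored verbatim rather than printed.

```cpp
// Hypothetical KeyStorage for a single contiguous NCHW float32 layout of
// shape (64, 3, 224, 224) on a CUDA handle (all concrete values assumed):
//   category: "plat:<handle-type>;dev=<device-name>;cap=7.5;runtime=11;<opr-type>"
//   input:    "64,3,224,224;150528,50176,224,1;Float32;<format>|<raw Param bytes>"
```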
@@ -56,9 +56,7 @@ size_t BatchConvBiasForwardImpl::get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& bias, const TensorLayout& z, | |||
const TensorLayout& dst) { | |||
AlgoBase::SizeArgs args(this, src, filter, bias, z, dst); | |||
return get_algorithm(this, src, filter, bias, z, dst) | |||
->get_workspace_in_bytes(args); | |||
return get_dnn_workspace(this, src, filter, bias, z, dst); | |||
} | |||
void BatchConvBiasForwardImpl::exec(_megdnn_tensor_in src, | |||
@@ -66,10 +64,12 @@ void BatchConvBiasForwardImpl::exec(_megdnn_tensor_in src, | |||
_megdnn_tensor_in bias, _megdnn_tensor_in z, | |||
_megdnn_tensor_out dst, | |||
_megdnn_workspace workspace) { | |||
check_exec(src.layout, filter.layout, bias.layout, z.layout, dst.layout, | |||
workspace.size); | |||
AlgoBase::ExecArgs args(this, src, filter, bias, z, dst, workspace); | |||
auto algo = get_algorithm(this, src.layout, filter.layout, bias.layout, | |||
z.layout, dst.layout); | |||
algo->check_workspace(args, workspace).exec(args); | |||
algo->exec(args); | |||
} | |||
const char* BatchConvBiasForwardImpl::get_algorithm_set_name() const { | |||
@@ -33,13 +33,12 @@ void BatchedMatrixMulForwardImpl::exec(_megdnn_tensor_in A, _megdnn_tensor_in B, | |||
AlgoBase::ExecArgs args(this, A, B, C, workspace); | |||
check_exec(A.layout, B.layout, C.layout, workspace.size); | |||
auto&& algo = megdnn::get_algorithm(this, A.layout, B.layout, C.layout); | |||
algo->check_workspace(args, workspace).exec(args); | |||
algo->exec(args); | |||
} | |||
size_t BatchedMatrixMulForwardImpl::get_workspace_in_bytes( | |||
const TensorLayout& A, const TensorLayout& B, const TensorLayout& C) { | |||
AlgoBase::SizeArgs args(this, A, B, C); | |||
return megdnn::get_algorithm(this, A, B, C)->get_workspace_in_bytes(args); | |||
return get_dnn_workspace(this, A, B, C); | |||
} | |||
std::vector<Algorithm*> BatchedMatrixMulForwardImpl::get_all_algorithms( | |||
@@ -36,7 +36,7 @@ void ConvBiasForwardImpl::exec(_megdnn_tensor_in src, _megdnn_tensor_in filter, | |||
preprocessed_filter); | |||
auto algo = get_algorithm(this, src.layout, filter.layout, bias.layout, | |||
z.layout, dst.layout); | |||
algo->check_workspace(args, workspace).exec(args); | |||
algo->exec(args); | |||
}; | |||
std::vector<ConvBiasForward::Algorithm*> | |||
@@ -228,6 +228,15 @@ size_t ConvBiasForwardImpl::get_workspace_in_bytes( | |||
const TensorLayout& bias, const TensorLayout& z, | |||
const TensorLayout& dst, | |||
const PreprocessedFilter* preprocessed_filter) { | |||
TensorLayoutArray layouts{src, filter, bias, z, dst}; | |||
HeuristicCache::Key key{this->handle(), this->get_opr_type(), | |||
layouts.data(), layouts.size(), &this->param(), | |||
sizeof(this->param())}; | |||
auto rst = HeuristicCache::instance().get(key); | |||
if (rst.policy.algo.valid()) { | |||
return rst.workspace; | |||
} | |||
AlgoBase::SizeArgs args{ | |||
this, src, filter, bias, z, dst, preprocessed_filter}; | |||
return get_algorithm(this, src, filter, bias, z, dst) | |||
@@ -58,9 +58,7 @@ size_t ConvolutionForwardImpl::get_workspace_in_bytes( | |||
const TensorLayout& dst, | |||
const PreprocessedFilter* preprocessed_filter) { | |||
MEGDNN_MARK_USED_VAR(preprocessed_filter); | |||
AlgoBase::SizeArgs args{this, src, filter, dst}; | |||
return megdnn::get_algorithm(this, src, filter, dst) | |||
->get_workspace_in_bytes(args); | |||
return get_dnn_workspace(this, src, filter, dst); | |||
} | |||
void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, | |||
@@ -72,7 +70,7 @@ void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, | |||
preprocessed_filter); | |||
AlgoBase::ExecArgs args(this, src, filter, dst, workspace); | |||
auto&& algo = get_algorithm(this, src.layout, filter.layout, dst.layout); | |||
algo->check_workspace(args, workspace).exec(args); | |||
algo->exec(args); | |||
} | |||
const char* ConvolutionForwardImpl::get_algorithm_set_name() const { | |||
@@ -85,9 +83,10 @@ void ConvolutionBackwardDataImpl::exec(_megdnn_tensor_in filter, | |||
_megdnn_tensor_in diff, | |||
_megdnn_tensor_out grad, | |||
_megdnn_workspace workspace) { | |||
check_exec(filter.layout, diff.layout, grad.layout, workspace.size); | |||
AlgoBase::ExecArgs args(this, filter, diff, grad, workspace); | |||
auto algo = get_algorithm(this, filter.layout, diff.layout, grad.layout); | |||
algo->check_workspace(args, workspace).exec(args); | |||
algo->exec(args); | |||
} | |||
std::vector<ConvolutionBackwardDataImpl::Algorithm*> | |||
@@ -196,9 +195,7 @@ ConvolutionBackwardDataImpl::get_algorithm_heuristic( | |||
size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes( | |||
const TensorLayout& filter, const TensorLayout& diff, | |||
const TensorLayout& grad) { | |||
AlgoBase::SizeArgs args(this, filter, diff, grad); | |||
return get_algorithm(this, filter, diff, grad) | |||
->get_workspace_in_bytes(args); | |||
return get_dnn_workspace(this, filter, diff, grad); | |||
} | |||
const char* ConvolutionBackwardDataImpl::get_algorithm_set_name() const { | |||
@@ -211,9 +208,10 @@ void ConvolutionBackwardFilterImpl::exec(_megdnn_tensor_in src, | |||
_megdnn_tensor_in diff, | |||
_megdnn_tensor_out grad, | |||
_megdnn_workspace workspace) { | |||
check_exec(src.layout, diff.layout, grad.layout, workspace.size); | |||
AlgoBase::ExecArgs args(this, src, diff, grad, workspace); | |||
auto algo = get_algorithm(this, src.layout, diff.layout, grad.layout); | |||
algo->check_workspace(args, workspace).exec(args); | |||
algo->exec(args); | |||
} | |||
std::vector<ConvolutionBackwardFilterImpl::Algorithm*> | |||
@@ -324,9 +322,7 @@ ConvolutionBackwardFilterImpl::get_algorithm_heuristic( | |||
size_t ConvolutionBackwardFilterImpl::get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& diff, | |||
const TensorLayout& grad) { | |||
AlgoBase::SizeArgs args(this, src, diff, grad); | |||
return get_algorithm(this, src, diff, grad) | |||
->get_workspace_in_bytes(args); | |||
return get_dnn_workspace(this, src, diff, grad); | |||
} | |||
const char* ConvolutionBackwardFilterImpl::get_algorithm_set_name() const { | |||
@@ -111,18 +111,17 @@ Convolution3DForwardImpl::get_all_algorithms(const TensorLayout& src, | |||
size_t Convolution3DForwardImpl::get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst) { | |||
AlgoBase::SizeArgs args(this, src, filter, dst); | |||
return get_algorithm(this, src, filter, dst) | |||
->get_workspace_in_bytes(args); | |||
return get_dnn_workspace(this, src, filter, dst); | |||
} | |||
void Convolution3DForwardImpl::exec(_megdnn_tensor_in src, | |||
_megdnn_tensor_in filter, | |||
_megdnn_tensor_out dst, | |||
_megdnn_workspace workspace) { | |||
check_exec(src.layout, filter.layout, dst.layout, workspace.size); | |||
AlgoBase::ExecArgs args(this, src, filter, dst, workspace); | |||
auto algo = get_algorithm(this, src.layout, filter.layout, dst.layout); | |||
algo->check_workspace(args, workspace).exec(args); | |||
algo->exec(args); | |||
} | |||
const char* Convolution3DForwardImpl::get_algorithm_set_name() const { | |||
@@ -133,9 +132,10 @@ void Convolution3DBackwardDataImpl::exec(_megdnn_tensor_in filter, | |||
_megdnn_tensor_in diff, | |||
_megdnn_tensor_out grad, | |||
_megdnn_workspace workspace) { | |||
check_exec(filter.layout, diff.layout, grad.layout, workspace.size); | |||
AlgoBase::ExecArgs args(this, filter, diff, grad, workspace); | |||
auto algo = get_algorithm(this, filter.layout, diff.layout, grad.layout); | |||
algo->check_workspace(args, workspace).exec(args); | |||
algo->exec(args); | |||
} | |||
std::vector<Convolution3DBackwardDataImpl::Algorithm*> | |||
@@ -200,9 +200,7 @@ Convolution3DBackwardDataImpl::get_algorithm_heuristic( | |||
size_t Convolution3DBackwardDataImpl::get_workspace_in_bytes( | |||
const TensorLayout& filter, const TensorLayout& diff, | |||
const TensorLayout& grad) { | |||
AlgoBase::SizeArgs args(this, filter, diff, grad); | |||
return get_algorithm(this, filter, diff, grad) | |||
->get_workspace_in_bytes(args); | |||
return get_dnn_workspace(this, filter, diff, grad); | |||
} | |||
const char* Convolution3DBackwardDataImpl::get_algorithm_set_name() const { | |||
@@ -213,10 +211,11 @@ void Convolution3DBackwardFilterImpl::exec(_megdnn_tensor_in src, | |||
_megdnn_tensor_in diff, | |||
_megdnn_tensor_out grad, | |||
_megdnn_workspace workspace) { | |||
check_exec(src.layout, diff.layout, grad.layout, workspace.size); | |||
AlgoBase::ExecArgs args(this, src, diff, grad, workspace); | |||
auto algo = | |||
get_algorithm(this, src.layout, diff.layout, grad.layout); | |||
algo->check_workspace(args, workspace).exec(args); | |||
algo->exec(args); | |||
} | |||
std::vector<Convolution3DBackwardFilterImpl::Algorithm*> | |||
@@ -281,9 +280,7 @@ Convolution3DBackwardFilterImpl::get_algorithm_heuristic( | |||
size_t Convolution3DBackwardFilterImpl::get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& diff, | |||
const TensorLayout& grad) { | |||
AlgoBase::SizeArgs args(this, src, diff, grad); | |||
return get_algorithm(this, src, diff, grad) | |||
->get_workspace_in_bytes(args); | |||
return get_dnn_workspace(this, src, diff, grad); | |||
} | |||
const char* Convolution3DBackwardFilterImpl::get_algorithm_set_name() const { | |||
@@ -36,8 +36,7 @@ size_t Fwd::get_workspace_in_bytes(const TensorLayout& im, | |||
const TensorLayout& offset, | |||
const TensorLayout& mask, | |||
const TensorLayout& dst) { | |||
auto algo = get_algorithm(this, im, filter, offset, mask, dst); | |||
return algo->get_workspace_in_bytes({this, im, filter, offset, mask, dst}); | |||
return get_dnn_workspace(this, im, filter, offset, mask, dst); | |||
} | |||
std::vector<AlgoFwd*> Fwd::get_all_algorithms(const TensorLayout& /* im */, | |||
@@ -96,13 +95,13 @@ const char* Fwd::get_algorithm_set_name() const { | |||
void Fwd::exec(_megdnn_tensor_in im, _megdnn_tensor_in filter, | |||
_megdnn_tensor_in offset, _megdnn_tensor_in mask, | |||
_megdnn_tensor_out out, _megdnn_workspace workspace) { | |||
check_exec(im.layout, filter.layout, offset.layout, mask.layout, out.layout, | |||
workspace.size); | |||
auto algo = get_algorithm(this, im.layout, filter.layout, offset.layout, | |||
mask.layout, out.layout); | |||
AlgoBase::ExecArgs args(this, im, filter, offset, mask, out, workspace); | |||
algo->check_workspace(args, workspace).exec(args); | |||
return; | |||
algo->exec(args); | |||
} | |||
/* ============== BwdFlt Implementation ============== */ | |||
@@ -152,21 +151,23 @@ AlgoBwdFlt* BwdFlt::get_algorithm_heuristic( | |||
size_t BwdFlt::get_workspace_in_bytes( | |||
const TensorLayout& im, const TensorLayout& offset, const TensorLayout& mask, | |||
const TensorLayout& out_grad, const TensorLayout& filter_grad) { | |||
auto algo = get_algorithm(this, im, offset, mask, out_grad, filter_grad); | |||
return algo->get_workspace_in_bytes({this, im, offset, mask, out_grad, filter_grad}); | |||
return get_dnn_workspace(this, im, offset, mask, out_grad, filter_grad); | |||
} | |||
const char* BwdFlt::get_algorithm_set_name() const { | |||
return "DEFORMABLE_CONV_BWD_FILTER_CUDA"; | |||
}; | |||
void BwdFlt::exec(_megdnn_tensor_in im, _megdnn_tensor_in offset, _megdnn_tensor_in mask, | |||
_megdnn_tensor_in out_grad, _megdnn_tensor_out filter_grad, | |||
_megdnn_workspace workspace) { | |||
AlgoBase::ExecArgs args(this, im, offset, mask, out_grad, filter_grad, workspace); | |||
auto algo = get_algorithm(this, im.layout, offset.layout, mask.layout, out_grad.layout, | |||
filter_grad.layout); | |||
algo->check_workspace(args, workspace).exec(args); | |||
void BwdFlt::exec(_megdnn_tensor_in im, _megdnn_tensor_in offset, | |||
_megdnn_tensor_in mask, _megdnn_tensor_in out_grad, | |||
_megdnn_tensor_out filter_grad, _megdnn_workspace workspace) { | |||
check_exec(im.layout, offset.layout, mask.layout, out_grad.layout, | |||
filter_grad.layout, workspace.size); | |||
AlgoBase::ExecArgs args(this, im, offset, mask, out_grad, filter_grad, | |||
workspace); | |||
auto algo = get_algorithm(this, im.layout, offset.layout, mask.layout, | |||
out_grad.layout, filter_grad.layout); | |||
algo->exec(args); | |||
} | |||
/* ============== BwdData Implementation ============== */ | |||
@@ -222,10 +223,8 @@ size_t BwdData::get_workspace_in_bytes( | |||
const TensorLayout& offset, const TensorLayout& mask, | |||
const TensorLayout& out_grad, const TensorLayout& im_grad, | |||
const TensorLayout& offset_grad, const TensorLayout& mask_grad) { | |||
auto algo = get_algorithm(this, im, filter, offset, mask, out_grad, | |||
im_grad, offset_grad, mask_grad); | |||
return algo->get_workspace_in_bytes({this, im, filter, offset, mask, out_grad, | |||
im_grad, offset_grad, mask_grad}); | |||
return get_dnn_workspace(this, im, filter, offset, mask, out_grad, im_grad, | |||
offset_grad, mask_grad); | |||
} | |||
const char* BwdData::get_algorithm_set_name() const { | |||
@@ -233,16 +232,19 @@ const char* BwdData::get_algorithm_set_name() const { | |||
}; | |||
void BwdData::exec(_megdnn_tensor_in im, _megdnn_tensor_in filter, | |||
_megdnn_tensor_in offset, _megdnn_tensor_in mask, | |||
_megdnn_tensor_in out_grad, _megdnn_tensor_out im_grad, | |||
_megdnn_tensor_out offset_grad, _megdnn_tensor_out mask_grad, | |||
_megdnn_workspace workspace) { | |||
_megdnn_tensor_in offset, _megdnn_tensor_in mask, | |||
_megdnn_tensor_in out_grad, _megdnn_tensor_out im_grad, | |||
_megdnn_tensor_out offset_grad, _megdnn_tensor_out mask_grad, | |||
_megdnn_workspace workspace) { | |||
check_exec(im.layout, filter.layout, offset.layout, mask.layout, | |||
out_grad.layout, im_grad.layout, offset_grad.layout, | |||
mask_grad.layout, workspace.size); | |||
AlgoBase::ExecArgs args(this, im, filter, offset, mask, out_grad, im_grad, | |||
offset_grad, mask_grad, workspace); | |||
auto algo = get_algorithm(this, im.layout, filter.layout, offset.layout, | |||
mask.layout, out_grad.layout, im_grad.layout, | |||
offset_grad.layout, mask_grad.layout); | |||
algo->check_workspace(args, workspace).exec(args); | |||
algo->exec(args); | |||
} | |||
// vim: syntax=cpp.doxygen |
@@ -59,17 +59,17 @@ LocalShareForwardImpl::get_all_algorithms(const TensorLayout& src, | |||
size_t LocalShareForwardImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& filter, | |||
const TensorLayout& dst) { | |||
AlgoBase::SizeArgs args(this, src, filter, dst); | |||
return get_algorithm(this, src, filter, dst)->get_workspace_in_bytes(args); | |||
return get_dnn_workspace(this, src, filter, dst); | |||
} | |||
void LocalShareForwardImpl::exec(_megdnn_tensor_in src, | |||
_megdnn_tensor_in filter, | |||
_megdnn_tensor_out dst, | |||
_megdnn_workspace workspace) { | |||
check_exec(src.layout, filter.layout, dst.layout, workspace.size); | |||
AlgoBase::ExecArgs args(this, src, filter, dst, workspace); | |||
auto algo = get_algorithm(this, src.layout, filter.layout, dst.layout); | |||
algo->check_workspace(args, workspace).exec(args); | |||
algo->exec(args); | |||
} | |||
const char* LocalShareForwardImpl::get_algorithm_set_name() const { | |||
@@ -112,8 +112,7 @@ LocalShareBackwardDataImpl::get_all_algorithms(const TensorLayout& filter, | |||
size_t LocalShareBackwardDataImpl::get_workspace_in_bytes(const TensorLayout& filter, | |||
const TensorLayout& diff, | |||
const TensorLayout& grad) { | |||
AlgoBase::SizeArgs args(this, filter, diff, grad); | |||
return get_algorithm(this, filter, diff, grad)->get_workspace_in_bytes(args); | |||
return get_dnn_workspace(this, filter, diff, grad); | |||
} | |||
void LocalShareBackwardDataImpl::exec(_megdnn_tensor_in filter, | |||
@@ -166,8 +165,7 @@ LocalShareBackwardFilterImpl::get_all_algorithms(const TensorLayout& src, | |||
size_t LocalShareBackwardFilterImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& diff, | |||
const TensorLayout& grad) { | |||
AlgoBase::SizeArgs args(this, src, diff, grad); | |||
return get_algorithm(this, src, diff, grad)->get_workspace_in_bytes(args); | |||
return get_dnn_workspace(this, src, diff, grad); | |||
} | |||
void LocalShareBackwardFilterImpl::exec(_megdnn_tensor_in src, | |||
@@ -59,8 +59,7 @@ MatrixMulForwardImpl::Algorithm* MatrixMulForwardImpl::get_algorithm_heuristic( | |||
size_t MatrixMulForwardImpl::get_workspace_in_bytes(const TensorLayout& A, | |||
const TensorLayout& B, | |||
const TensorLayout& C) { | |||
AlgoBase::SizeArgs args{this, A, B, C}; | |||
return megdnn::get_algorithm(this, A, B, C)->get_workspace_in_bytes(args); | |||
return get_dnn_workspace(this, A, B, C); | |||
} | |||
void MatrixMulForwardImpl::exec(_megdnn_tensor_in A, _megdnn_tensor_in B, | |||
@@ -69,7 +68,7 @@ void MatrixMulForwardImpl::exec(_megdnn_tensor_in A, _megdnn_tensor_in B, | |||
check_exec(A.layout, B.layout, C.layout, workspace.size); | |||
AlgoBase::ExecArgs args(this, A, B, C, workspace); | |||
auto&& algo = get_algorithm(this, A.layout, B.layout, C.layout); | |||
algo->check_workspace(args, workspace).exec(args); | |||
algo->exec(args); | |||
} | |||
} // namespace cuda | |||
@@ -21,8 +21,7 @@ namespace cuda { | |||
size_t PoolingForwardImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& dst) { | |||
AlgoBase::SizeArgs args(this, src, dst); | |||
return get_algorithm(this, src, dst)->get_workspace_in_bytes(args); | |||
return get_dnn_workspace(this, src, dst); | |||
} | |||
const char* PoolingForwardImpl::get_algorithm_set_name() const { | |||
@@ -117,9 +116,7 @@ size_t PoolingBackwardImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& dst, | |||
const TensorLayout& diff, | |||
const TensorLayout& grad) { | |||
AlgoBase::SizeArgs args(this, src, dst, diff, grad); | |||
return get_algorithm(this, src, dst, diff, grad) | |||
->get_workspace_in_bytes(args); | |||
return get_dnn_workspace(this, src, dst, diff, grad); | |||
} | |||
} // namespace cuda | |||
@@ -44,8 +44,7 @@ BatchedMatrixMulForwardImpl::get_algorithm_heuristic( | |||
size_t BatchedMatrixMulForwardImpl::get_workspace_in_bytes( | |||
const TensorLayout& A, const TensorLayout& B, const TensorLayout& C) { | |||
AlgoBase::SizeArgs args{this, A, B, C}; | |||
return megdnn::get_algorithm(this, A, B, C)->get_workspace_in_bytes(args); | |||
return get_dnn_workspace(this, A, B, C); | |||
} | |||
void BatchedMatrixMulForwardImpl::exec(_megdnn_tensor_in A, _megdnn_tensor_in B, | |||
@@ -54,7 +53,7 @@ void BatchedMatrixMulForwardImpl::exec(_megdnn_tensor_in A, _megdnn_tensor_in B, | |||
check_exec(A.layout, B.layout, C.layout, workspace.size); | |||
AlgoBase::ExecArgs args(this, A, B, C, workspace); | |||
auto&& algo = get_algorithm(this, A.layout, B.layout, C.layout); | |||
algo->check_workspace(args, workspace).exec(args); | |||
algo->exec(args); | |||
} | |||
// vim: syntax=cpp.doxygen |
@@ -224,6 +224,15 @@ size_t ConvBiasImpl::get_workspace_in_bytes( | |||
const TensorLayout& bias, const TensorLayout& z, | |||
const TensorLayout& dst, | |||
const PreprocessedFilter* preprocessed_filter) { | |||
TensorLayoutArray layouts{src, filter, bias, z, dst}; | |||
HeuristicCache::Key key{this->handle(), this->get_opr_type(), | |||
layouts.data(), layouts.size(), &this->param(), | |||
sizeof(this->param())}; | |||
auto rst = HeuristicCache::instance().get(key); | |||
if (rst.policy.algo.valid()) { | |||
return rst.workspace; | |||
} | |||
auto fparam = make_ncb_kern_size_param(src, filter, bias, dst, | |||
preprocessed_filter); | |||
auto&& algo = get_algorithm(fparam); | |||
@@ -146,6 +146,15 @@ size_t ConvolutionImpl::get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst, | |||
const PreprocessedFilter* preprocessed_filter) { | |||
TensorLayoutArray layouts{src, filter, dst}; | |||
HeuristicCache::Key key{this->handle(), this->get_opr_type(), | |||
layouts.data(), layouts.size(), &this->param(), | |||
sizeof(this->param())}; | |||
auto rst = HeuristicCache::instance().get(key); | |||
if (rst.policy.algo.valid()) { | |||
return rst.workspace; | |||
} | |||
auto fparam = | |||
make_ncb_kern_size_param(src, filter, dst, preprocessed_filter); | |||
auto&& algo = get_algorithm(fparam); | |||
@@ -494,6 +503,15 @@ void ConvolutionBackwardDataImpl::exec(_megdnn_tensor_in filter, | |||
size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes( | |||
const TensorLayout& filter, const TensorLayout& diff, | |||
const TensorLayout& grad) { | |||
TensorLayoutArray layouts{filter, diff, grad}; | |||
HeuristicCache::Key key{this->handle(), this->get_opr_type(), | |||
layouts.data(), layouts.size(), &this->param(), | |||
sizeof(this->param())}; | |||
auto rst = HeuristicCache::instance().get(key); | |||
if (rst.policy.algo.valid()) { | |||
return rst.workspace; | |||
} | |||
if (param().format == param::Convolution::Format::NHWCD4 || | |||
param().format == param::Convolution::Format::NCHW4 || | |||
(param().format == param::Convolution::Format::NCHW && | |||
@@ -219,6 +219,15 @@ MatrixMulImpl::KernParam MatrixMulImpl::make_kern_param( | |||
size_t MatrixMulImpl::get_workspace_in_bytes(const TensorLayout& A, | |||
const TensorLayout& B, | |||
const TensorLayout& C) { | |||
TensorLayoutArray layouts{A, B, C}; | |||
HeuristicCache::Key key{this->handle(), this->get_opr_type(), | |||
layouts.data(), layouts.size(), &this->param(), | |||
sizeof(this->param())}; | |||
auto rst = HeuristicCache::instance().get(key); | |||
if (rst.policy.algo.valid()) { | |||
return rst.workspace; | |||
} | |||
if (auto algo = get_algorithm_heuristic( | |||
A, B, C, std::numeric_limits<size_t>::max(), | |||
AlgoAttribute::DEFAULT, AlgoAttribute::DEFAULT)) { | |||
@@ -15,6 +15,7 @@ | |||
#include "src/naive/convolution/helper.h" | |||
#include <cstring> | |||
#include "megdnn/heuristic_cache.h" | |||
#include "src/common/utils.h" | |||
#include "src/naive/handle.h" | |||
@@ -56,6 +57,14 @@ size_t BatchConvBiasForwardImpl::get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& flt, | |||
const TensorLayout& bias, const TensorLayout& z, | |||
const TensorLayout& dst) { | |||
TensorLayoutArray layouts{src, flt, bias, z, dst}; | |||
HeuristicCache::Key key{this->handle(), this->get_opr_type(), | |||
layouts.data(), layouts.size(), &this->param(), | |||
sizeof(this->param())}; | |||
auto rst = HeuristicCache::instance().get(key); | |||
if (rst.policy.algo.valid()) { | |||
return rst.workspace; | |||
} | |||
return get_workspace_bundle(nullptr, src, flt, bias, z, dst) | |||
.total_size_in_bytes(); | |||
} | |||
@@ -13,6 +13,7 @@ | |||
#include "src/naive/convolution/helper.h" | |||
#include <cstring> | |||
#include "megdnn/heuristic_cache.h" | |||
#include "megdnn/dtype.h" | |||
#include "src/common/conv_bias.h" | |||
#include "src/common/opr_delegate.h" | |||
@@ -201,6 +202,15 @@ size_t ConvBiasForwardImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& z, | |||
const TensorLayout& dst, | |||
const PreprocessedFilter*) { | |||
TensorLayoutArray layouts{src, flt, bias, z, dst}; | |||
HeuristicCache::Key key{this->handle(), this->get_opr_type(), | |||
layouts.data(), layouts.size(), &this->param(), | |||
sizeof(this->param())}; | |||
auto rst = HeuristicCache::instance().get(key); | |||
if (rst.policy.algo.valid()) { | |||
return rst.workspace; | |||
} | |||
size_t float_workspace_size = 0; | |||
if (z.ndim > 0 && z.dtype.category() != DTypeCategory::FLOAT) { | |||
@@ -11,7 +11,7 @@ | |||
#include "./opr_impl.h" | |||
#include "./helper.h" | |||
#include "src/naive/handle.h" | |||
#include "megdnn/heuristic_cache.h" | |||
#include "src/naive/handle.h" | |||
#include "src/common/utils.h" | |||
#include "megdnn/dtype.h" | |||
@@ -78,6 +78,15 @@ void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, | |||
size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes(const TensorLayout& filter, | |||
const TensorLayout& diff, | |||
const TensorLayout& grad) { | |||
TensorLayoutArray layouts{filter, diff, grad}; | |||
HeuristicCache::Key key{this->handle(), this->get_opr_type(), | |||
layouts.data(), layouts.size(), &this->param(), | |||
sizeof(this->param())}; | |||
auto rst = HeuristicCache::instance().get(key); | |||
if (rst.policy.algo.valid()) { | |||
return rst.workspace; | |||
} | |||
size_t workspace_size = 0; | |||
auto flt_dt = filter.dtype.enumv(); | |||
auto grad_dt = grad.dtype.enumv(); | |||
@@ -191,6 +200,15 @@ size_t ConvolutionBackwardFilterImpl::get_workspace_in_bytes( | |||
const TensorLayout& grad) { | |||
size_t workspace_size = 0; | |||
#if !MEGDNN_DISABLE_FLOAT16 | |||
TensorLayoutArray layouts{src, diff, grad}; | |||
HeuristicCache::Key key{this->handle(), this->get_opr_type(), | |||
layouts.data(), layouts.size(), &this->param(), | |||
sizeof(this->param())}; | |||
auto rst = HeuristicCache::instance().get(key); | |||
if (rst.policy.algo.valid()) { | |||
return rst.workspace; | |||
} | |||
auto src_dt = src.dtype.enumv(); | |||
auto grad_dt = grad.dtype.enumv(); | |||
auto diff_dt = diff.dtype.enumv(); | |||
@@ -12,6 +12,7 @@ | |||
#include "src/naive/pooling/opr_impl.h" | |||
#include <cstring> | |||
#include "megdnn/heuristic_cache.h" | |||
#include "megdnn/dtype.h" | |||
#include "src/common/utils.h" | |||
#include "src/naive/handle.h" | |||
@@ -402,6 +403,14 @@ WorkspaceBundle PoolingForwardImpl::get_workspace_bundle( | |||
size_t PoolingForwardImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& dst) { | |||
TensorLayoutArray layouts{src, dst}; | |||
HeuristicCache::Key key{this->handle(), this->get_opr_type(), | |||
layouts.data(), layouts.size(), &this->param(), | |||
sizeof(this->param())}; | |||
auto rst = HeuristicCache::instance().get(key); | |||
if (rst.policy.algo.valid()) { | |||
return rst.workspace; | |||
} | |||
return get_workspace_bundle(nullptr, src, dst).total_size_in_bytes(); | |||
} | |||
namespace { | |||
@@ -652,6 +661,14 @@ WorkspaceBundle PoolingBackwardImpl::get_workspace_bundle( | |||
size_t PoolingBackwardImpl::get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& dst, | |||
const TensorLayout& diff, const TensorLayout& grad) { | |||
TensorLayoutArray layouts{src, dst, diff, grad}; | |||
HeuristicCache::Key key{this->handle(), this->get_opr_type(), | |||
layouts.data(), layouts.size(), &this->param(), | |||
sizeof(this->param())}; | |||
auto rst = HeuristicCache::instance().get(key); | |||
if (rst.policy.algo.valid()) { | |||
return rst.workspace; | |||
} | |||
return get_workspace_bundle(nullptr, src, dst, diff, grad) | |||
.total_size_in_bytes(); | |||
} | |||
@@ -47,8 +47,7 @@ BatchedMatrixMulForwardImpl::get_algorithm_heuristic( | |||
size_t BatchedMatrixMulForwardImpl::get_workspace_in_bytes( | |||
const TensorLayout& A, const TensorLayout& B, const TensorLayout& C) { | |||
AlgoBase::SizeArgs args{this, A, B, C}; | |||
return megdnn::get_algorithm(this, A, B, C)->get_workspace_in_bytes(args); | |||
return get_dnn_workspace(this, A, B, C); | |||
} | |||
void BatchedMatrixMulForwardImpl::exec(_megdnn_tensor_in A, _megdnn_tensor_in B, | |||
@@ -57,7 +56,7 @@ void BatchedMatrixMulForwardImpl::exec(_megdnn_tensor_in A, _megdnn_tensor_in B, | |||
check_exec(A.layout, B.layout, C.layout, workspace.size); | |||
AlgoBase::ExecArgs args(this, A, B, C, workspace); | |||
auto&& algo = get_algorithm(this, A.layout, B.layout, C.layout); | |||
algo->check_workspace(args, workspace).exec(args); | |||
algo->exec(args); | |||
} | |||
// vim: syntax=cpp.doxygen |
@@ -112,19 +112,30 @@ ConvolutionForwardImpl::get_all_algorithms(const TensorLayout& src, | |||
size_t ConvolutionForwardImpl::get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst, const PreprocessedFilter*) { | |||
TensorLayoutArray layouts{src, filter, dst}; | |||
HeuristicCache::Key key{this->handle(), this->get_opr_type(), | |||
layouts.data(), layouts.size(), &this->param(), | |||
sizeof(this->param())}; | |||
auto rst = HeuristicCache::instance().get(key); | |||
if (rst.policy.algo.valid()) { | |||
return rst.workspace; | |||
} | |||
AlgoBase::SizeArgs args(this, src, filter, dst); | |||
return get_algorithm(this, src, args.filter_meta, dst) | |||
return get_algorithm(this, src, filter, dst) | |||
->get_workspace_in_bytes(args); | |||
} | |||
void ConvolutionForwardImpl::exec(_megdnn_tensor_in src, | |||
_megdnn_tensor_in filter, | |||
_megdnn_tensor_out dst, | |||
const PreprocessedFilter*, | |||
const PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) { | |||
check_exec(src.layout, filter.layout, dst.layout, workspace.size, | |||
preprocessed_filter); | |||
AlgoBase::ExecArgs args(this, src, filter, dst, workspace); | |||
auto algo = get_algorithm(this, src.layout, args.filter_meta, dst.layout); | |||
algo->check_workspace(args, workspace).exec(args); | |||
auto algo = get_algorithm(this, src.layout, filter.layout, dst.layout); | |||
algo->exec(args); | |||
} | |||
const char* ConvolutionForwardImpl::get_algorithm_set_name() const { | |||
@@ -137,9 +148,10 @@ void ConvolutionBackwardDataImpl::exec(_megdnn_tensor_in filter, | |||
_megdnn_tensor_in diff, | |||
_megdnn_tensor_out grad, | |||
_megdnn_workspace workspace) { | |||
check_exec(filter.layout, diff.layout, grad.layout, workspace.size); | |||
AlgoBase::ExecArgs args(this, filter, diff, grad, workspace); | |||
auto algo = get_algorithm(this, args.filter_meta, diff.layout, grad.layout); | |||
algo->check_workspace(args, workspace).exec(args); | |||
auto algo = get_algorithm(this, filter.layout, diff.layout, grad.layout); | |||
algo->exec(args); | |||
} | |||
std::vector<ConvolutionBackwardDataImpl::Algorithm*> | |||
@@ -192,8 +204,17 @@ ConvolutionBackwardDataImpl::get_algorithm_heuristic( | |||
size_t ConvolutionBackwardDataImpl::get_workspace_in_bytes( | |||
const TensorLayout& filter, const TensorLayout& diff, | |||
const TensorLayout& grad) { | |||
TensorLayoutArray layouts{filter, diff, grad}; | |||
HeuristicCache::Key key{this->handle(), this->get_opr_type(), | |||
layouts.data(), layouts.size(), &this->param(), | |||
sizeof(this->param())}; | |||
auto rst = HeuristicCache::instance().get(key); | |||
if (rst.policy.algo.valid()) { | |||
return rst.workspace; | |||
} | |||
AlgoBase::SizeArgs args(this, filter, diff, grad); | |||
return get_algorithm(this, args.filter_meta, diff, grad) | |||
return get_algorithm(this, filter, diff, grad) | |||
->get_workspace_in_bytes(args); | |||
} | |||
@@ -207,10 +228,11 @@ void ConvolutionBackwardFilterImpl::exec(_megdnn_tensor_in src, | |||
_megdnn_tensor_in diff, | |||
_megdnn_tensor_out grad, | |||
_megdnn_workspace workspace) { | |||
check_exec(src.layout, diff.layout, grad.layout, workspace.size); | |||
AlgoBase::ExecArgs args(this, src, diff, grad, workspace); | |||
auto algo = | |||
get_algorithm(this, src.layout, diff.layout, args.grad_filter_meta); | |||
algo->check_workspace(args, workspace).exec(args); | |||
get_algorithm(this, src.layout, diff.layout, grad.layout); | |||
algo->exec(args); | |||
} | |||
std::vector<ConvolutionBackwardFilterImpl::Algorithm*> | |||
@@ -264,8 +286,17 @@ ConvolutionBackwardFilterImpl::get_algorithm_heuristic( | |||
size_t ConvolutionBackwardFilterImpl::get_workspace_in_bytes( | |||
const TensorLayout& src, const TensorLayout& diff, | |||
const TensorLayout& grad) { | |||
TensorLayoutArray layouts{src, diff, grad}; | |||
HeuristicCache::Key key{this->handle(), this->get_opr_type(), | |||
layouts.data(), layouts.size(), &this->param(), | |||
sizeof(this->param())}; | |||
auto rst = HeuristicCache::instance().get(key); | |||
if (rst.policy.algo.valid()) { | |||
return rst.workspace; | |||
} | |||
AlgoBase::SizeArgs args(this, src, diff, grad); | |||
return get_algorithm(this, src, diff, args.grad_filter_meta) | |||
return get_algorithm(this, src, diff, grad) | |||
->get_workspace_in_bytes(args); | |||
} | |||
@@ -24,7 +24,7 @@ public: | |||
const PreprocessedFilter* preprocessed_filter, | |||
_megdnn_workspace workspace) override; | |||
AlgorithmInfo get_algorithm_info_heuristic( | |||
const TensorLayout& src, const CanonizedFilterMeta& filter, | |||
const TensorLayout& src, const TensorLayout& filter, | |||
const TensorLayout& dst, size_t workspace_limit_in_bytes, | |||
const AlgoAttribute& positive_attr, | |||
const AlgoAttribute& negative_attr) { | |||
@@ -95,7 +95,7 @@ public: | |||
void exec(_megdnn_tensor_in filter, _megdnn_tensor_in diff, | |||
_megdnn_tensor_out grad, _megdnn_workspace workspace) override; | |||
AlgorithmInfo get_algorithm_info_heuristic( | |||
const CanonizedFilterMeta& filter, const TensorLayout& diff, | |||
const TensorLayout& filter, const TensorLayout& diff, | |||
const TensorLayout& grad, size_t workspace_limit_in_bytes, | |||
const AlgoAttribute& positive_attr, | |||
const AlgoAttribute& negative_attr) { | |||
@@ -145,7 +145,7 @@ public: | |||
_megdnn_tensor_out grad, _megdnn_workspace workspace) override; | |||
AlgorithmInfo get_algorithm_info_heuristic( | |||
const TensorLayout& src, const TensorLayout& diff, | |||
const CanonizedFilterMeta& grad, size_t workspace_limit_in_bytes, | |||
const TensorLayout& grad, size_t workspace_limit_in_bytes, | |||
const AlgoAttribute& positive_attr, | |||
const AlgoAttribute& negative_attr) { | |||
return get_algorithm_heuristic(src, diff, grad, | |||
@@ -44,8 +44,7 @@ MatrixMulForwardImpl::Algorithm* MatrixMulForwardImpl::get_algorithm_heuristic( | |||
size_t MatrixMulForwardImpl::get_workspace_in_bytes(const TensorLayout& A, | |||
const TensorLayout& B, | |||
const TensorLayout& C) { | |||
AlgoBase::SizeArgs args{this, A, B, C}; | |||
return megdnn::get_algorithm(this, A, B, C)->get_workspace_in_bytes(args); | |||
return get_dnn_workspace(this, A, B, C); | |||
} | |||
void MatrixMulForwardImpl::exec(_megdnn_tensor_in A, _megdnn_tensor_in B, | |||
@@ -54,7 +53,7 @@ void MatrixMulForwardImpl::exec(_megdnn_tensor_in A, _megdnn_tensor_in B, | |||
check_exec(A.layout, B.layout, C.layout, workspace.size); | |||
AlgoBase::ExecArgs args(this, A, B, C, workspace); | |||
auto&& algo = get_algorithm(this, A.layout, B.layout, C.layout); | |||
algo->check_workspace(args, workspace).exec(args); | |||
algo->exec(args); | |||
} | |||
// vim: syntax=cpp.doxygen |
@@ -19,8 +19,7 @@ namespace rocm { | |||
size_t PoolingForwardImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& dst) { | |||
AlgoBase::SizeArgs args(this, src, dst); | |||
return get_algorithm(this, src, dst)->get_workspace_in_bytes(args); | |||
return get_dnn_workspace(this, src, dst); | |||
} | |||
const char* PoolingForwardImpl::get_algorithm_set_name() const { | |||
@@ -69,9 +68,7 @@ size_t PoolingBackwardImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& dst, | |||
const TensorLayout& diff, | |||
const TensorLayout& grad) { | |||
AlgoBase::SizeArgs args(this, src, dst, diff, grad); | |||
return get_algorithm(this, src, dst, diff, grad) | |||
->get_workspace_in_bytes(args); | |||
return get_dnn_workspace(this, src, dst, diff, grad); | |||
}; | |||
const char* PoolingBackwardImpl::get_algorithm_set_name() const { | |||
@@ -46,6 +46,15 @@ WorkspaceBundle megdnn::x86::get_bundle(const TensorLayout& src, | |||
size_t PoolingImpl::get_workspace_in_bytes(const TensorLayout& src, | |||
const TensorLayout& dst) { | |||
TensorLayoutArray layouts{src, dst}; | |||
HeuristicCache::Key key{this->handle(), this->get_opr_type(), | |||
layouts.data(), layouts.size(), &this->param(), | |||
sizeof(this->param())}; | |||
auto rst = HeuristicCache::instance().get(key); | |||
if (rst.policy.algo.valid()) { | |||
return rst.workspace; | |||
} | |||
auto algo = get_algorithm(this, src, dst); | |||
if (!is_fallback_algo(algo)) { | |||
if (is_supported(SIMDType::SSE) && src.dtype == dtype::Float32() && | |||
@@ -29,6 +29,7 @@ | |||
#include "megbrain/plugin/profiler.h" | |||
#include "megbrain/test/helper.h" | |||
#include "megdnn/heuristic_cache.h" | |||
#include "megdnn/oprs/base.h" | |||
#include <atomic> | |||
@@ -2075,10 +2076,12 @@ void test_free_memory_in_weight_preprocess(int record_level, CompNode cn) { | |||
TEST(TestGraph, FreeMemoryInWeightPreprocess) { | |||
test_free_memory_in_weight_preprocess(0, CompNode::load("xpu0")); | |||
megdnn::HeuristicCache::instance().clear(); | |||
} | |||
TEST(TestGraph, RecordFreeMemoryInWeightPreprocess) { | |||
test_free_memory_in_weight_preprocess(1, CompNode::load("cpu0")); | |||
megdnn::HeuristicCache::instance().clear(); | |||
} | |||
namespace { | |||
@@ -2157,6 +2160,7 @@ TEST(TestGraph, FreeMemoryInWeightPreprocessWithValueInfer) { | |||
->cast_final_safe<opr::SharedDeviceTensor>() | |||
.get_dev_tensor() | |||
.empty()); | |||
megdnn::HeuristicCache::instance().clear(); | |||
} | |||
TEST(TestGraph, FreeMemoryInWeightPreprocessWithMultiReader) { | |||
@@ -2200,6 +2204,7 @@ TEST(TestGraph, FreeMemoryInWeightPreprocessWithMultiReader) { | |||
->cast_final_safe<opr::SharedDeviceTensor>() | |||
.get_dev_tensor() | |||
.empty()); | |||
megdnn::HeuristicCache::instance().clear(); | |||
} | |||
TEST(TestGraph, FreeBias) { | |||
@@ -24,6 +24,7 @@ | |||
//! TODO: here has to be know some megdnn::opr when there is produced midout.h | |||
//! fix it if there is another graceful way. | |||
#include "megdnn/heuristic_cache.h" | |||
#include "megdnn/opr_param_defs.h" | |||
#include "megdnn/oprs.h" | |||
#include "megdnn/oprs/base.h" | |||
@@ -1156,6 +1157,15 @@ template <typename Opr> | |||
size_t AlgoChooser<Opr>::setup_algo(const FixedTensorLayouts& layouts, | |||
Opr* megdnn_opr, const MGBOpr* mgb_opr, | |||
bool allow_weight_preprocess) { | |||
HeuristicCache::Key cache_key( | |||
megdnn_opr->handle(), megdnn_opr->get_opr_type(), layouts.data(), | |||
layouts.size(), &megdnn_opr->param(), sizeof(megdnn_opr->param())); | |||
auto rst = HeuristicCache::instance().get(cache_key); | |||
if (rst.policy.algo.valid()) { | |||
megdnn_opr->execution_policy() = rst.policy; | |||
return rst.workspace; | |||
} | |||
if (WorkspaceLimitGetter::is_prealloc_run(mgb_opr->owner_graph())) { | |||
return 0; | |||
} | |||
@@ -1192,6 +1202,11 @@ size_t AlgoChooser<Opr>::setup_algo(const FixedTensorLayouts& layouts, | |||
mgb_log_debug("%s", ret.c_str()); | |||
megdnn_opr->execution_policy() = policy; | |||
if (mgb_opr->execution_policy().strategy & ExecutionStrategy::HEURISTIC) { | |||
HeuristicCache::Result cache_result{policy, workspace}; | |||
HeuristicCache::instance().put(cache_key, cache_result); | |||
} | |||
return workspace; | |||
} | |||
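Taken together, the two insertions above give AlgoChooser<Opr>::setup_algo() the condensed shape sketched below (a restatement of the hunks above, with the existing selection/profiling logic elided); a result is stored only when the chosen strategy includes ExecutionStrategy::HEURISTIC, so profiled fast-run policies are not cached here.

```cpp
// Condensed sketch of the cached path added to AlgoChooser<Opr>::setup_algo().
HeuristicCache::Key cache_key(
        megdnn_opr->handle(), megdnn_opr->get_opr_type(), layouts.data(),
        layouts.size(), &megdnn_opr->param(), sizeof(megdnn_opr->param()));
auto rst = HeuristicCache::instance().get(cache_key);
if (rst.policy.algo.valid()) {
    megdnn_opr->execution_policy() = rst.policy;  // reuse the cached policy
    return rst.workspace;                         // and its workspace size
}
// ... existing selection producing `policy` and `workspace` ...
megdnn_opr->execution_policy() = policy;
if (mgb_opr->execution_policy().strategy & ExecutionStrategy::HEURISTIC) {
    HeuristicCache::Result cache_result{policy, workspace};
    HeuristicCache::instance().put(cache_key, cache_result);
}
return workspace;
```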
@@ -22,6 +22,7 @@ | |||
#include "megbrain/opr/tensor_manip.h" | |||
#include "megdnn/oprs/base.h" | |||
#include "megdnn/dtype.h" | |||
#include "megdnn/heuristic_cache.h" | |||
#include <cmath> | |||
#include <random> | |||
@@ -337,6 +338,7 @@ void test_no_profiling_on_shape_change(const TensorShapeArray& inps0, | |||
TEST(TestOprDNN, FastrunNoProfilingOnShapeChange) { | |||
REQUIRE_GPU(1); | |||
megdnn::HeuristicCache::instance().clear(); | |||
test_no_profiling_on_shape_change<opr::Convolution>( | |||
{{12, 3, 36, 36}, {4, 3, 3, 3}}, {{32, 3, 28, 28}, {4, 3, 3, 3}}); | |||
@@ -21,6 +21,7 @@ | |||
#include "megbrain/gopt/inference.h" | |||
#include "megbrain/opr/tensor_manip.h" | |||
#include "megdnn/dtype.h" | |||
#include "megdnn/heuristic_cache.h" | |||
#include "megdnn/oprs/base.h" | |||
#include <gmock/gmock.h> | |||
@@ -396,6 +397,7 @@ TEST(TestOprDNN, ConvBiasExePolicy) { | |||
#endif | |||
run(strategy); | |||
} | |||
megdnn::HeuristicCache::instance().clear(); | |||
ASSERT_THROW(run(S::OPTIMIZED | S::PROFILE), MegBrainError); | |||
PersistentCache::set_impl(orig_impl); | |||
} | |||
@@ -460,6 +462,7 @@ TEST(TestOprDNN, ConvolutionExePolicy) { | |||
for (auto strategy : | |||
SmallVector<S>{S::HEURISTIC, S::PROFILE | S::HEURISTIC}) { | |||
#endif | |||
megdnn::HeuristicCache::instance().clear(); | |||
using Checker = AutoOprChecker<2, 1>; | |||
auto make_graph = [&](const Checker::SymInpArray& inputs) | |||
@@ -489,6 +492,7 @@ TEST(TestOprDNN, ConvolutionExePolicy) { | |||
} else { | |||
ASSERT_LT(0, nr_get); | |||
} | |||
megdnn::HeuristicCache::instance().clear(); | |||
} | |||
} | |||
@@ -544,6 +548,7 @@ TEST(TestOprDNN, ConvolutionBackwardDataBfloat16ExePolicy) { | |||
#else | |||
for (auto strategy: {S::HEURISTIC, S(S::PROFILE | S::HEURISTIC)}) { | |||
#endif | |||
megdnn::HeuristicCache::instance().clear(); | |||
using Checker = AutoOprChecker<2, 1>; | |||
auto make_graph = [&](const Checker::SymInpArray& inputs) | |||
@@ -1835,6 +1840,7 @@ TEST(TestOprDNN, LocalShareForwardExecPolicy) { | |||
auto run_with_param = [&](size_t fh = 3, size_t fw = 3, size_t sh = 1, | |||
size_t sw = 1, size_t sgh = 3, | |||
size_t sgw = 3) { | |||
megdnn::HeuristicCache::instance().clear(); | |||
size_t ph = fh / 2, pw = fw / 2; | |||
param.pad_h = ph, param.pad_w = pw; | |||
param.stride_h = sh, param.stride_w = sw, | |||
@@ -2289,6 +2295,7 @@ TEST(TestOprDNN, HeuristicReproducible) { | |||
} | |||
algo_name0 = palgo->name(); | |||
} | |||
megdnn::HeuristicCache::instance().clear(); | |||
{ | |||
Checker checker(make_graph, fwd); | |||
checker.run(inp_tensor(2, 3, 4, 9, 8, 3, 3), opt) | |||
@@ -2306,6 +2313,7 @@ TEST(TestOprDNN, HeuristicReproducible) { | |||
algo_name1 = palgo->name(); | |||
} | |||
EXPECT_TRUE(algo_name0 == algo_name1); | |||
megdnn::HeuristicCache::instance().clear(); | |||
} | |||
#undef inp_tensor | |||
#undef get_shp | |||
@@ -2585,6 +2593,7 @@ TEST_F(TestWeightPreprocess, NoPreprocessNeeded) { | |||
} | |||
TEST_F(TestWeightPreprocess, PreprocessCalledOnlyOnce) { | |||
megdnn::HeuristicCache::instance().clear(); | |||
using ::testing::_; | |||
using ::testing::Return; | |||
using ::testing::Field; | |||