@@ -86,7 +86,6 @@ from .core._imperative_rt.core2 import sync as _sync
 from .core._imperative_rt.common import (
     get_supported_sm_versions as _get_supported_sm_versions,
 )
-from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func
 from .config import *
 from .device import *
 from .logger import enable_debug_log, get_logger, set_log_file, set_log_level
@@ -118,13 +117,6 @@ def _check_sm_version():
 _check_sm_version()

-_set_fork_exec_path_for_timed_func(
-    sys.executable,
-    os.path.join(os.path.dirname(__file__), "utils", "_timed_func_fork_exec_entry.py"),
-)
-del _set_fork_exec_path_for_timed_func

 _exit_handlers = []
@@ -14,9 +14,11 @@ from ._imperative_rt.core2 import (
 __compute_mode = "default"
 _benchmark_kernel = False
 _deterministic_kernel = False
+_benchmark_with_subprocess = False

 __all__ = [
     "benchmark_kernel",
+    "benchmark_with_subprocess",
     "deterministic_kernel",
     "async_level",
     "disable_memory_forwarding",
@@ -72,6 +74,34 @@ def deterministic_kernel(mod, option: bool):
+@property
+def benchmark_with_subprocess(mod):
+    r"""Whether to run all possible algorithms on the real device to find the best one.
+    The default is False, which means a heuristic is used to choose the fastest algorithm.
+
+    Examples:
+        .. code-block::
+
+            import megengine as mge
+            mge.config.benchmark_with_subprocess = True
+    """
+    return _benchmark_with_subprocess
+
+
+@benchmark_with_subprocess.setter
+def benchmark_with_subprocess(mod, option: bool):
+    global _benchmark_with_subprocess
+    if option:
+        import sys
+
+        from ._imperative_rt.utils import _set_fork_exec_path_for_timed_func
+
+        _set_fork_exec_path_for_timed_func(
+            sys.executable,
+            os.path.join(
+                os.path.dirname(__file__), "../utils", "_timed_func_fork_exec_entry.py"
+            ),
+        )
+    _benchmark_with_subprocess = option
+
+
 @property
 def async_level(mod) -> int:
     r"""Get or set config whether raise error exactly when invoking op. The default level is 2,
     which means both device and user side errors are async.
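
For reference, a minimal usage sketch of the new option (the conv workload below is illustrative, and candidate algorithms are only timed when fastrun profiling is active; `benchmark_with_subprocess` itself is the API added above):

    import numpy as np

    import megengine as mge
    import megengine.functional as F

    # enable subprocess-based profiling; the setter above lazily imports and
    # wires up _set_fork_exec_path_for_timed_func only when the flag is set
    mge.config.benchmark_with_subprocess = True

    x = mge.Tensor(np.random.randn(1, 3, 32, 32).astype("float32"))
    w = mge.Tensor(np.random.randn(8, 3, 3, 3).astype("float32"))
    y = F.conv2d(x, w)  # candidate algorithms may now be timed in a child process
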
@@ -481,7 +481,7 @@ class TimedFuncInvokerImpl final : public TimedFuncInvoker {
             return iter->second.direct_call(param);
     if (!m_fork_exec_impl) {
-        mgb_log_warn(
+        mgb_log_debug(
                 "timeout is set, but no fork_exec_impl not given; "
                 "timeout would be ignored");
        return iter->second.direct_call(param);
@@ -595,6 +595,10 @@ typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelp
     auto&& search_items = flatten_search_space<Opr>(*this, circular_deps_checker);
     FOREACH_OPR_TYPE_DISPATCH(search_items, {
         auto&& megdnn_opr = opr::intl::create_megdnn_opr<_Opr>(m_cn);
+        // skip sub oprs of a different type, e.g. a matmul algo
+        // encountered while profiling a convolution
+        if (m_dnn_opr->get_opr_type() != megdnn_opr->get_opr_type())
+            continue;
         megdnn_opr->param() =
                 Algorithm::deserialize_read_pod<typename _Opr::Param>(_item.param);
         typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper(
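
To make the new filter concrete, here is an illustrative Python sketch of the same idea (the item tuples and type tags are hypothetical stand-ins, not MegEngine structures):

    # flattened search space: (opr_type, serialized_param) stand-ins
    search_items = [("CONVOLUTION", "p0"), ("MATMUL", "p1"), ("CONVOLUTION", "p2")]
    current_type = "CONVOLUTION"

    # keep only sub oprs of the profiled type, mirroring the `continue` above
    to_profile = [item for item in search_items if item[0] == current_type]
    assert to_profile == [("CONVOLUTION", "p0"), ("CONVOLUTION", "p2")]
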
@@ -609,7 +613,9 @@ typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelp
     // result, retrive_from_cache = true, allow_log = true
     typename AlgoChooser<Opr>::ImplExecutionPolicy policy;
     construct_execution_policy(selected_strategy, policy);
-    return policy;
+    if (policy.algo.valid())
+        return policy;
+    return choose_by_heuristic(selected_strategy);
     MIDOUT_E
 }
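
The control flow of the new fallback, sketched in Python (`Policy` and `choose_by_heuristic` are hypothetical stand-ins): a policy constructed from the cache is returned only if it carries a valid algorithm; otherwise the chooser falls back to the heuristic instead of handing back an invalid policy.

    from typing import Optional

    class Policy:
        def __init__(self, algo: Optional[str] = None):
            self.algo = algo

        def valid(self) -> bool:
            return self.algo is not None

    def choose_by_heuristic(strategy: str) -> Policy:
        # stand-in for the heuristic chooser
        return Policy("heuristic_algo")

    def choose_policy(strategy: str, cached: Policy) -> Policy:
        # return the cached policy only when it carries a valid algorithm;
        # otherwise fall back to the heuristic, mirroring the C++ change above
        if cached.valid():
            return cached
        return choose_by_heuristic(strategy)

    assert choose_policy("PROFILE", Policy()).algo == "heuristic_algo"
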
@@ -712,7 +718,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::construct_execution_policy(
             ::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(),
             Algorithm::attribute_str(target_attr.first).c_str(),
             Algorithm::attribute_str(target_attr.second).c_str());
-    mgb_log_warn(
+    mgb_log_debug(
             "No algo get from cache for %s. This may caused by "
             "mismatch with model and cache file or imcomplete "
             "cache file. ex. profiling with version1, but "
@@ -876,6 +882,10 @@ Maybe<AlgoChooserProfileCache::ResultEntry> AlgoChooser<Opr>::AlgoChooserHelper:
     if (!rst.valid())
         return None;
+    // the subprocess returns DBL_MAX when the memory limit is not satisfied
+    if (rst.val().time == std::numeric_limits<double>::max())
+        return None;
     std::string algo_desc;
     serialize_write_pod(policy.algo, algo_desc);
     return AlgoChooserProfileCache::ResultEntry{
@@ -893,6 +903,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
     auto&& rst = get_profile_result_from_cache(selected_strategy);
     // rst.first.valid means there exists valid algorithms for current opr, just return
     // otherwise need to profile
+    // in order to avoid re-profiling in fastrun
     if (rst.first.valid())
         return;
     AlgoChooserProfileCache::Result prof_rst;
@@ -901,6 +912,10 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
     std::string layouts_str = AlgoChooser::format_fixlayouts(m_fastrun_layouts);
     double cur_timeout = 0;

+    size_t data_size = 0;
+    for (auto ly : m_fastrun_layouts)
+        data_size += ly.span().dist_byte();
+
     auto workspace_limit =
             m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit);
     RealTimer timer;
@@ -925,6 +940,12 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
         ImplExecutionPolicy policy;
         policy.algo = algo.desc;

+        // skip the naive algo; no attribute identifies it, so match by
+        // name (strcmp) instead
+        if (algo.desc.name.compare("NAIVE") == 0) {
+            continue;
+        }
+
         //! check negative attribute : skip negative attribute
         auto palgo = m_dnn_opr->get_algorithm_from_desc(policy.algo);
         if (palgo->contain_attribute_any(target_attr.second)) {
@@ -938,10 +959,13 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
         //! check workspace limit
         construct_execution_policy(selected_strategy, policy);
-        mgb_assert(
-                policy.algo.valid(),
-                "construct execution policy must success when profiling");
-        if (get_workspace_size_bytes(policy) > workspace_limit) {
+        // construction may fail here, e.g. when constructing a matmul
+        // algorithm for a convolution opr
+        if (!policy.algo.valid())
+            continue;
+        size_t workspace_needed = get_workspace_size_bytes(policy);
+        if (data_size + workspace_needed >
+            m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit)) {
             continue;
         }
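
Together with the data_size accumulation a few hunks above, the admission test for a candidate algorithm now reads roughly as follows (a Python sketch with made-up sizes; dist_byte corresponds to a layout's byte span):

    # byte footprint of all fastrun layouts (data_size in the hunk above)
    layout_bytes = [64 * 3 * 224 * 224 * 4, 64 * 64 * 111 * 111 * 4]
    data_size = sum(layout_bytes)

    def admit(workspace_needed: int, workspace_limit: int) -> bool:
        # profile an algorithm only if tensor data plus its workspace
        # fits under the workspace limit
        return data_size + workspace_needed <= workspace_limit

    assert not admit(workspace_needed=8 << 30, workspace_limit=4 << 30)
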
@@ -957,7 +981,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
         })
         // megbrain uncatched exception
         MGB_CATCH(..., {
-            mgb_log_warn("caught exception during %s", msg.c_str());
+            mgb_log_debug("caught exception during %s", msg.c_str());
             continue;
         })
         if (!cur_rst.valid()) {
@@ -982,20 +1006,22 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
             "workspace limite requirement(%zu)",
             ::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(),
             Algorithm::attribute_str(target_attr.second).c_str(), workspace_limit);
-    mgb_assert(!prof_rst.empty(), "%s", msg.c_str());
+    // an empty profile result is allowed for the current opr;
+    // append previously profiled results, if any
+    if (rst.second.valid())
+        prof_rst.insert(
+                prof_rst.end(), rst.second.val().begin(), rst.second.val().end());

-    FixedTensorLayouts incache_layouts = m_incache_layouts;
-    typename Opr::Param origin_param = m_dnn_opr->param();
-    AlgoChooserProfileCache::Key cache_key{
-            incache_layouts.data(), incache_layouts.size(), &origin_param,
-            sizeof(origin_param)};
-    AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str());
-    cache.put(cache_key, prof_rst);
+    if (!prof_rst.empty()) {
+        FixedTensorLayouts incache_layouts = m_incache_layouts;
+        typename Opr::Param origin_param = m_dnn_opr->param();
+        AlgoChooserProfileCache::Key cache_key{
+                incache_layouts.data(), incache_layouts.size(), &origin_param,
+                sizeof(origin_param)};
+        AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str());
+        cache.put(cache_key, prof_rst);
+    }
     MIDOUT_E
 }
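
A sketch of the new bookkeeping at the end of profiling (plain dict/list stand-ins for the profile cache): previously cached entries are merged back into the result, and the cache is written only when there is something to store, so an opr with no surviving algorithm no longer trips an assertion.

    prof_rst = []                    # nothing survived profiling this run
    previous = [("algo_a", 0.12)]    # entries already in the cache, if any

    prof_rst.extend(previous)        # append previously profiled results
    cache = {}
    if prof_rst:                     # write back only non-empty results
        cache[("layouts", "param")] = prof_rst
    assert cache                     # previous entries were preserved
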
@@ -245,21 +245,34 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
         }
     });
     {
-        // first allocate a whole chunk to avoid memory fragmentation (here we
-        // rely on memory allocator to reuse memory)
-        auto align = cn.get_mem_addr_alignment();
-        size_t tot_size = align;
-        for (int i = 0; i < arity; ++i) {
-            tot_size += layouts[i].span().high_byte + align;
-        }
-        for (const auto& layout : preprocessed_layout) {
-            tot_size += layout.span().high_byte + align;
-        }
-        tot_size += param.workspace;
-        DeviceTensorStorage storage{cn};
-        storage.ensure_size(tot_size);
+        megdnn::Algorithm* algo =
+                megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo);
+        mgb_assert(algo);
+#if !MGB_BUILD_SLIM_SERVING
+#if MGB_CUDA || MGB_ROCM
+        // if tot_size exceeds workspace_limit, skip the current algo and
+        // return DBL_MAX; this check is needed because, when profiling an
+        // algo in a subprocess, the child process occupies some cuda memory
+        // for its own initialization, so checking against free memory here
+        // is more accurate than the previous up-front assertion
+        size_t workspace_limit =
+                std::max(cn.get_free_mem(), cn.get_max_block_size_available());
+        auto align = cn.get_mem_addr_alignment();
+        size_t tot_size = align;
+        for (int i = 0; i < arity; ++i) {
+            tot_size += layouts[i].span().high_byte + align;
+        }
+        for (const auto& layout : preprocessed_layout) {
+            tot_size += layout.span().high_byte + align;
+        }
+        tot_size += param.workspace;
+        if (tot_size > workspace_limit) {
+            mgb_log_debug(
+                    "current memory is not enough when profiling algo %s\n",
+                    algo->name());
+            return TResult::from_pod(Result{std::numeric_limits<double>::max()});
+        }
+#endif
+#endif
     // allocate input and output memory
     std::array<DeviceTensorND, arity_in> inp_val;
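
The DBL_MAX sentinel ties the two sides together: the child returns it when the projected allocation cannot fit, and get_profile_result_from_cache (earlier hunk) discards such entries instead of caching them. A compact Python sketch of the round trip (function names are stand-ins):

    import sys

    OOM_SENTINEL = sys.float_info.max  # std::numeric_limits<double>::max()

    def prof_impl(tot_size: int, workspace_limit: int) -> float:
        # child process: refuse to profile when the projected allocation
        # cannot fit, and report the sentinel instead of a real timing
        if tot_size > workspace_limit:
            return OOM_SENTINEL
        return 0.123  # stand-in for the measured kernel time

    def result_entry(time: float):
        # parent process: drop sentinel results instead of caching them
        return None if time == OOM_SENTINEL else {"time": time}

    assert result_entry(prof_impl(10 << 30, 8 << 30)) is None
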
@@ -334,20 +347,17 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
     });
     ev_end->record();

-    megdnn::Algorithm* algo =
-            megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo);
-    mgb_assert(algo);
     double next_report_time = 0.5;
     while (!ev_end->finished()) {
         if (timer.get_secs() >= next_report_time) {
 #if MGB_ENABLE_GETENV
             mgb_log_debug(
-                    "profiling conv algo %s already took %.3f/%.3f secs"
+                    "profiling algo %s already took %.3f/%.3f secs"
                     " (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ",
                     algo->name(), timer.get_secs(), param.actual_timeout);
 #else
             mgb_log_debug(
-                    "profiling conv algo %s already took %.3f/%.3f secs", algo->name(),
+                    "profiling algo %s already took %.3f/%.3f secs", algo->name(),
                     timer.get_secs(), param.actual_timeout);
 #endif
             next_report_time = timer.get_secs() + 1;
@@ -357,6 +367,19 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
         std::this_thread::sleep_for(1000us);
 #endif
     }
+    DeviceTensorStorage storage;
+    for (int i = 0; i < arity_in; ++i) {
+        inp_val[i].reset(storage, TensorLayout{});
+    }
+    for (int i = 0; i < arity_out; ++i) {
+        out_val[i].reset(storage, TensorLayout{});
+    }
+    for (size_t i = 0; i < preprocessed_layout.size(); i++) {
+        flt_val[i].reset(storage, TensorLayout{});
+    }
+    mdn_workspace = megdnn::Workspace{};
+    workspace.reset(storage, TensorLayout{});
+
     // release all free blocks owned by child process,
     // in order to avoid main process running out of memory
     cn.try_coalesce_all_free_memory();
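
The explicit resets matter because try_coalesce_all_free_memory can only return blocks that no live tensor still pins; dropping every reference first lets the child hand back its whole pool before the parent resumes. In Python terms (a loose analogy, not MegEngine API):

    import gc

    buffers = {
        "inp": bytearray(1 << 20),
        "out": bytearray(1 << 20),
        "workspace": bytearray(1 << 20),
    }
    buffers.clear()  # analogous to the reset(storage, TensorLayout{}) calls
    gc.collect()     # analogous to cn.try_coalesce_all_free_memory()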