Browse Source

refactor(mgb): check input when profiling

GitOrigin-RevId: 1d722dd741
release-1.10
Megvii Engine Team 3 years ago
parent
commit
b36b5bd8cb
5 changed files with 115 additions and 44 deletions
  1. +0
    -8
      imperative/python/megengine/__init__.py
  2. +30
    -0
      imperative/python/megengine/core/_config.py
  3. +1
    -1
      src/core/impl/system.cpp
  4. +42
    -16
      src/rdnn/impl/algo_chooser.cpp
  5. +42
    -19
      src/rdnn/impl/profiler.cpp

+ 0
- 8
imperative/python/megengine/__init__.py View File

@@ -86,7 +86,6 @@ from .core._imperative_rt.core2 import sync as _sync
from .core._imperative_rt.common import ( from .core._imperative_rt.common import (
get_supported_sm_versions as _get_supported_sm_versions, get_supported_sm_versions as _get_supported_sm_versions,
) )
from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func
from .config import * from .config import *
from .device import * from .device import *
from .logger import enable_debug_log, get_logger, set_log_file, set_log_level from .logger import enable_debug_log, get_logger, set_log_file, set_log_level
@@ -118,13 +117,6 @@ def _check_sm_version():


_check_sm_version() _check_sm_version()


_set_fork_exec_path_for_timed_func(
sys.executable,
os.path.join(os.path.dirname(__file__), "utils", "_timed_func_fork_exec_entry.py"),
)

del _set_fork_exec_path_for_timed_func

_exit_handlers = [] _exit_handlers = []






+ 30
- 0
imperative/python/megengine/core/_config.py View File

@@ -14,9 +14,11 @@ from ._imperative_rt.core2 import (
__compute_mode = "default" __compute_mode = "default"
_benchmark_kernel = False _benchmark_kernel = False
_deterministic_kernel = False _deterministic_kernel = False
_benchmark_with_subprocess = False


__all__ = [ __all__ = [
"benchmark_kernel", "benchmark_kernel",
"benchmark_with_subprocess",
"deterministic_kernel", "deterministic_kernel",
"async_level", "async_level",
"disable_memory_forwarding", "disable_memory_forwarding",
@@ -72,6 +74,34 @@ def deterministic_kernel(mod, option: bool):




@property @property
def benchmark_with_subprocess(mod):
r"""Whether or not to run possible algorithms on the real device to find the best one. The default option is false,
which means a heuristic is used to choose the fastest algorithm.
Examples:
.. code-block::

import megengine as mge
mge.config.benchmark_with_subprocess = True
"""
return _benchmark_with_subprocess


@benchmark_with_subprocess.setter
def benchmark_with_subprocess(mod, option: bool):
if option:
import sys
from ._imperative_rt.utils import _set_fork_exec_path_for_timed_func

_set_fork_exec_path_for_timed_func(
sys.executable,
os.path.join(
os.path.dirname(__file__), "../utils", "_timed_func_fork_exec_entry.py"
),
)


@property
def async_level(mod) -> int: def async_level(mod) -> int:
r"""Get or set config whether raise error exactly when invoking op. The default level is 2, r"""Get or set config whether raise error exactly when invoking op. The default level is 2,
which means both device and user side errors are async. which means both device and user side errors are async.


+ 1
- 1
src/core/impl/system.cpp View File

@@ -481,7 +481,7 @@ class TimedFuncInvokerImpl final : public TimedFuncInvoker {
return iter->second.direct_call(param); return iter->second.direct_call(param);


if (!m_fork_exec_impl) { if (!m_fork_exec_impl) {
mgb_log_warn(
mgb_log_debug(
"timeout is set, but no fork_exec_impl not given; " "timeout is set, but no fork_exec_impl not given; "
"timeout would be ignored"); "timeout would be ignored");
return iter->second.direct_call(param); return iter->second.direct_call(param);


+ 42
- 16
src/rdnn/impl/algo_chooser.cpp View File

@@ -595,6 +595,10 @@ typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelp
auto&& search_items = flatten_search_space<Opr>(*this, circular_deps_checker); auto&& search_items = flatten_search_space<Opr>(*this, circular_deps_checker);
FOREACH_OPR_TYPE_DISPATCH(search_items, { FOREACH_OPR_TYPE_DISPATCH(search_items, {
auto&& megdnn_opr = opr::intl::create_megdnn_opr<_Opr>(m_cn); auto&& megdnn_opr = opr::intl::create_megdnn_opr<_Opr>(m_cn);
// skip different sub opr, for example:
// skip matmul algo when profiling convolution
if (m_dnn_opr->get_opr_type() != megdnn_opr->get_opr_type())
continue;
megdnn_opr->param() = megdnn_opr->param() =
Algorithm::deserialize_read_pod<typename _Opr::Param>(_item.param); Algorithm::deserialize_read_pod<typename _Opr::Param>(_item.param);
typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper( typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper(
@@ -609,7 +613,9 @@ typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelp
// result, retrive_from_cache = true, allow_log = true // result, retrive_from_cache = true, allow_log = true
typename AlgoChooser<Opr>::ImplExecutionPolicy policy; typename AlgoChooser<Opr>::ImplExecutionPolicy policy;
construct_execution_policy(selected_strategy, policy); construct_execution_policy(selected_strategy, policy);
return policy;
if (policy.algo.valid())
return policy;
return choose_by_heuristic(selected_strategy);
MIDOUT_E MIDOUT_E
} }


@@ -712,7 +718,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::construct_execution_policy(
::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(), ::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(),
Algorithm::attribute_str(target_attr.first).c_str(), Algorithm::attribute_str(target_attr.first).c_str(),
Algorithm::attribute_str(target_attr.second).c_str()); Algorithm::attribute_str(target_attr.second).c_str());
mgb_log_warn(
mgb_log_debug(
"No algo get from cache for %s. This may caused by " "No algo get from cache for %s. This may caused by "
"mismatch with model and cache file or imcomplete " "mismatch with model and cache file or imcomplete "
"cache file. ex. profiling with version1, but " "cache file. ex. profiling with version1, but "
@@ -876,6 +882,10 @@ Maybe<AlgoChooserProfileCache::ResultEntry> AlgoChooser<Opr>::AlgoChooserHelper:
if (!rst.valid()) if (!rst.valid())
return None; return None;


// subprocess will return dbl_max when the memory limit is not satisfied
if (rst.val().time == std::numeric_limits<double>::max())
return None;

std::string algo_desc; std::string algo_desc;
serialize_write_pod(policy.algo, algo_desc); serialize_write_pod(policy.algo, algo_desc);
return AlgoChooserProfileCache::ResultEntry{ return AlgoChooserProfileCache::ResultEntry{
@@ -893,6 +903,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
auto&& rst = get_profile_result_from_cache(selected_strategy); auto&& rst = get_profile_result_from_cache(selected_strategy);
// rst.first.valid means there exists valid algorithms for current opr, just return // rst.first.valid means there exists valid algorithms for current opr, just return
// otherwise need to profile // otherwise need to profile
// in order to avoid re-profiling in fastrun
if (rst.first.valid()) if (rst.first.valid())
return; return;
AlgoChooserProfileCache::Result prof_rst; AlgoChooserProfileCache::Result prof_rst;
@@ -901,6 +912,10 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
std::string layouts_str = AlgoChooser::format_fixlayouts(m_fastrun_layouts); std::string layouts_str = AlgoChooser::format_fixlayouts(m_fastrun_layouts);
double cur_timeout = 0; double cur_timeout = 0;


size_t data_size = 0;
for (auto ly : m_fastrun_layouts)
data_size += ly.span().dist_byte();

auto workspace_limit = auto workspace_limit =
m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit); m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit);
RealTimer timer; RealTimer timer;
@@ -925,6 +940,12 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
ImplExecutionPolicy policy; ImplExecutionPolicy policy;
policy.algo = algo.desc; policy.algo = algo.desc;


// skip naive algo; attributes cannot be used to identify the naive algo, thus using
// strcmp
if (algo.desc.name.compare("NAIVE") == 0) {
continue;
}

//! check negative attribute : skip negative attribute //! check negative attribute : skip negative attribute
auto palgo = m_dnn_opr->get_algorithm_from_desc(policy.algo); auto palgo = m_dnn_opr->get_algorithm_from_desc(policy.algo);
if (palgo->contain_attribute_any(target_attr.second)) { if (palgo->contain_attribute_any(target_attr.second)) {
@@ -938,10 +959,13 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(


//! check workspace limit //! check workspace limit
construct_execution_policy(selected_strategy, policy); construct_execution_policy(selected_strategy, policy);
mgb_assert(
policy.algo.valid(),
"construct execution policy must success when profiling");
if (get_workspace_size_bytes(policy) > workspace_limit) {
// this can fail
// when constructing a matmul algorithm for a convolution opr
if (!policy.algo.valid())
continue;
size_t workspace_needed = get_workspace_size_bytes(policy);
if (data_size + workspace_needed >
m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit)) {
continue; continue;
} }


@@ -957,7 +981,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
}) })
// megbrain uncatched exception // megbrain uncatched exception
MGB_CATCH(..., { MGB_CATCH(..., {
mgb_log_warn("caught exception during %s", msg.c_str());
mgb_log_debug("caught exception during %s", msg.c_str());
continue; continue;
}) })
if (!cur_rst.valid()) { if (!cur_rst.valid()) {
@@ -982,20 +1006,22 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
"workspace limite requirement(%zu)", "workspace limite requirement(%zu)",
::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(), ::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(),
Algorithm::attribute_str(target_attr.second).c_str(), workspace_limit); Algorithm::attribute_str(target_attr.second).c_str(), workspace_limit);
mgb_assert(!prof_rst.empty(), "%s", msg.c_str());
// allowed to have empty profile result for current opr


// append some previous profiled results // append some previous profiled results
if (rst.second.valid()) if (rst.second.valid())
prof_rst.insert( prof_rst.insert(
prof_rst.end(), rst.second.val().begin(), rst.second.val().end()); prof_rst.end(), rst.second.val().begin(), rst.second.val().end());
FixedTensorLayouts incache_layouts = m_incache_layouts;
typename Opr::Param origin_param = m_dnn_opr->param();
AlgoChooserProfileCache::Key cache_key{
incache_layouts.data(), incache_layouts.size(), &origin_param,
sizeof(origin_param)};

AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str());
cache.put(cache_key, prof_rst);
if (!prof_rst.empty()) {
FixedTensorLayouts incache_layouts = m_incache_layouts;
typename Opr::Param origin_param = m_dnn_opr->param();
AlgoChooserProfileCache::Key cache_key{
incache_layouts.data(), incache_layouts.size(), &origin_param,
sizeof(origin_param)};

AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str());
cache.put(cache_key, prof_rst);
}
MIDOUT_E MIDOUT_E
} }




+ 42
- 19
src/rdnn/impl/profiler.cpp View File

@@ -245,21 +245,34 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
} }
}); });


{
// first allocate a whole chunk to avoid memory fragmentation (here we
// rely on memory allocator to reuse memory)
auto align = cn.get_mem_addr_alignment();
size_t tot_size = align;
for (int i = 0; i < arity; ++i) {
tot_size += layouts[i].span().high_byte + align;
}
for (const auto& layout : preprocessed_layout) {
tot_size += layout.span().high_byte + align;
}
tot_size += param.workspace;
DeviceTensorStorage storage{cn};
storage.ensure_size(tot_size);
megdnn::Algorithm* algo =
megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo);
mgb_assert(algo);

#if !MGB_BUILD_SLIM_SERVING
#if MGB_CUDA || MGB_ROCM
// if tot_size > workspace_limit, then skip current algo and return double_max
// this check is needed because when profiling an algo with a subprocess,
// the child process would occupy some cuda memory for initialization;
// this check is more accurate than the previous one
size_t workspace_limit =
std::max(cn.get_free_mem(), cn.get_max_block_size_available());
auto align = cn.get_mem_addr_alignment();
size_t tot_size = align;
for (int i = 0; i < arity; ++i) {
tot_size += layouts[i].span().high_byte + align;
}
for (const auto& layout : preprocessed_layout) {
tot_size += layout.span().high_byte + align;
}
tot_size += param.workspace;
if (tot_size > workspace_limit) {
mgb_log_debug(
"current memory is not enouugh when profiling algo %s\n", algo->name());
return TResult::from_pod(Result{std::numeric_limits<double>::max()});
} }
#endif
#endif


// allocate input and output memory // allocate input and output memory
std::array<DeviceTensorND, arity_in> inp_val; std::array<DeviceTensorND, arity_in> inp_val;
@@ -334,20 +347,17 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
}); });
ev_end->record(); ev_end->record();


megdnn::Algorithm* algo =
megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo);
mgb_assert(algo);
double next_report_time = 0.5; double next_report_time = 0.5;
while (!ev_end->finished()) { while (!ev_end->finished()) {
if (timer.get_secs() >= next_report_time) { if (timer.get_secs() >= next_report_time) {
#if MGB_ENABLE_GETENV #if MGB_ENABLE_GETENV
mgb_log_debug( mgb_log_debug(
"profiling conv algo %s already took %.3f/%.3f secs"
"profiling algo %s already took %.3f/%.3f secs"
" (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ", " (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ",
algo->name(), timer.get_secs(), param.actual_timeout); algo->name(), timer.get_secs(), param.actual_timeout);
#else #else
mgb_log_debug( mgb_log_debug(
"profiling conv algo %s already took %.3f/%.3f secs", algo->name(),
"profiling algo %s already took %.3f/%.3f secs", algo->name(),
timer.get_secs(), param.actual_timeout); timer.get_secs(), param.actual_timeout);
#endif #endif
next_report_time = timer.get_secs() + 1; next_report_time = timer.get_secs() + 1;
@@ -357,6 +367,19 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
std::this_thread::sleep_for(1000us); std::this_thread::sleep_for(1000us);
#endif #endif
} }

DeviceTensorStorage storage;
for (int i = 0; i < arity_in; ++i) {
inp_val[i].reset(storage, TensorLayout{});
}
for (int i = 0; i < arity_out; ++i) {
out_val[i].reset(storage, TensorLayout{});
}
for (size_t i = 0; i < preprocessed_layout.size(); i++) {
flt_val[i].reset(storage, TensorLayout{});
}
mdn_workspace = megdnn::Workspace{};
workspace.reset(storage, TensorLayout{});
// release all free blocks owned by child process, // release all free blocks owned by child process,
// in order to avoid main process running out of memory // in order to avoid main process running out of memory
cn.try_coalesce_all_free_memory(); cn.try_coalesce_all_free_memory();


Loading…
Cancel
Save