@@ -86,7 +86,6 @@ from .core._imperative_rt.core2 import sync as _sync
 from .core._imperative_rt.common import (
     get_supported_sm_versions as _get_supported_sm_versions,
 )
-from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func
 from .config import *
 from .device import *
 from .logger import enable_debug_log, get_logger, set_log_file, set_log_level
@@ -118,13 +117,6 @@ def _check_sm_version():
 _check_sm_version()

-_set_fork_exec_path_for_timed_func(
-    sys.executable,
-    os.path.join(os.path.dirname(__file__), "utils", "_timed_func_fork_exec_entry.py"),
-)
-del _set_fork_exec_path_for_timed_func

 _exit_handlers = []
@@ -14,9 +14,11 @@ from ._imperative_rt.core2 import (
 __compute_mode = "default"
 _benchmark_kernel = False
 _deterministic_kernel = False
+_benchmark_with_subprocess = False

 __all__ = [
     "benchmark_kernel",
+    "benchmark_with_subprocess",
     "deterministic_kernel",
     "async_level",
     "disable_memory_forwarding",
@@ -72,6 +74,34 @@ def deterministic_kernel(mod, option: bool):
+@property
+def benchmark_with_subprocess(mod):
+    r"""Whether to run all possible algorithms on the real device to find the best one.
+    The default is False, which means a heuristic is used to choose the fastest algorithm.
+
+    Examples:
+        .. code-block::
+
+            import megengine as mge
+            mge.config.benchmark_with_subprocess = True
+    """
+    return _benchmark_with_subprocess
+
+
+@benchmark_with_subprocess.setter
+def benchmark_with_subprocess(mod, option: bool):
+    global _benchmark_with_subprocess
+    if option:
+        import sys
+
+        from ._imperative_rt.utils import _set_fork_exec_path_for_timed_func
+
+        _set_fork_exec_path_for_timed_func(
+            sys.executable,
+            os.path.join(
+                os.path.dirname(__file__), "../utils", "_timed_func_fork_exec_entry.py"
+            ),
+        )
+    _benchmark_with_subprocess = option
+
+
 @property
 def async_level(mod) -> int:
     r"""Get or set config whether raise error exactly when invoking op. The default level is 2,
     which means both device and user side errors are async.
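
For reference, a minimal usage sketch of the new option (the conv workload below is illustrative, and candidate algorithms are only timed when fastrun profiling is active; `benchmark_with_subprocess` itself is the API added above):

    import numpy as np

    import megengine as mge
    import megengine.functional as F

    # enable subprocess-based profiling; the setter above lazily imports and
    # wires up _set_fork_exec_path_for_timed_func only when the flag is set
    mge.config.benchmark_with_subprocess = True

    x = mge.Tensor(np.random.randn(1, 3, 32, 32).astype("float32"))
    w = mge.Tensor(np.random.randn(8, 3, 3, 3).astype("float32"))
    y = F.conv2d(x, w)  # candidate algorithms may now be timed in a child process
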
@@ -481,7 +481,7 @@ class TimedFuncInvokerImpl final : public TimedFuncInvoker {
             return iter->second.direct_call(param);
     if (!m_fork_exec_impl) {
-        mgb_log_warn(
+        mgb_log_debug(
                 "timeout is set, but no fork_exec_impl not given; "
                 "timeout would be ignored");
        return iter->second.direct_call(param);
@@ -595,6 +595,10 @@ typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelp
     auto&& search_items = flatten_search_space<Opr>(*this, circular_deps_checker);
     FOREACH_OPR_TYPE_DISPATCH(search_items, {
         auto&& megdnn_opr = opr::intl::create_megdnn_opr<_Opr>(m_cn);
+        // skip sub oprs of a different type, e.g. a matmul algo
+        // encountered while profiling a convolution
+        if (m_dnn_opr->get_opr_type() != megdnn_opr->get_opr_type())
+            continue;
         megdnn_opr->param() =
                 Algorithm::deserialize_read_pod<typename _Opr::Param>(_item.param);
         typename AlgoChooser<_Opr>::AlgoChooserHelper sub_helper(
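
To make the new filter concrete, here is an illustrative Python sketch of the same idea (the item tuples and type tags are hypothetical stand-ins, not MegEngine structures):

    # flattened search space: (opr_type, serialized_param) stand-ins
    search_items = [("CONVOLUTION", "p0"), ("MATMUL", "p1"), ("CONVOLUTION", "p2")]
    current_type = "CONVOLUTION"

    # keep only sub oprs of the profiled type, mirroring the `continue` above
    to_profile = [item for item in search_items if item[0] == current_type]
    assert to_profile == [("CONVOLUTION", "p0"), ("CONVOLUTION", "p2")]
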
@@ -609,7 +613,9 @@ typename AlgoChooser<Opr>::ImplExecutionPolicy AlgoChooser<Opr>::AlgoChooserHelp
     // result, retrive_from_cache = true, allow_log = true
     typename AlgoChooser<Opr>::ImplExecutionPolicy policy;
     construct_execution_policy(selected_strategy, policy);
-    return policy;
+    if (policy.algo.valid())
+        return policy;
+    return choose_by_heuristic(selected_strategy);
     MIDOUT_E
 }
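
The control flow of the new fallback, sketched in Python (`Policy` and `choose_by_heuristic` are hypothetical stand-ins): a policy constructed from the cache is returned only if it carries a valid algorithm; otherwise the chooser falls back to the heuristic instead of handing back an invalid policy.

    from typing import Optional

    class Policy:
        def __init__(self, algo: Optional[str] = None):
            self.algo = algo

        def valid(self) -> bool:
            return self.algo is not None

    def choose_by_heuristic(strategy: str) -> Policy:
        # stand-in for the heuristic chooser
        return Policy("heuristic_algo")

    def choose_policy(strategy: str, cached: Policy) -> Policy:
        # return the cached policy only when it carries a valid algorithm;
        # otherwise fall back to the heuristic, mirroring the C++ change above
        if cached.valid():
            return cached
        return choose_by_heuristic(strategy)

    assert choose_policy("PROFILE", Policy()).algo == "heuristic_algo"
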
@@ -712,7 +718,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::construct_execution_policy(
             ::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(),
             Algorithm::attribute_str(target_attr.first).c_str(),
             Algorithm::attribute_str(target_attr.second).c_str());
-    mgb_log_warn(
+    mgb_log_debug(
             "No algo get from cache for %s. This may caused by "
             "mismatch with model and cache file or imcomplete "
             "cache file. ex. profiling with version1, but "
@@ -876,6 +882,10 @@ Maybe<AlgoChooserProfileCache::ResultEntry> AlgoChooser<Opr>::AlgoChooserHelper:
     if (!rst.valid())
         return None;
+    // the subprocess returns DBL_MAX when the memory limit is not satisfied
+    if (rst.val().time == std::numeric_limits<double>::max())
+        return None;
     std::string algo_desc;
     serialize_write_pod(policy.algo, algo_desc);
     return AlgoChooserProfileCache::ResultEntry{
@@ -893,6 +903,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
     auto&& rst = get_profile_result_from_cache(selected_strategy);
     // rst.first.valid means there exists valid algorithms for current opr, just return
     // otherwise need to profile
+    // in order to avoid re-profiling in fastrun
     if (rst.first.valid())
         return;
     AlgoChooserProfileCache::Result prof_rst;
@@ -901,6 +912,10 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
     std::string layouts_str = AlgoChooser::format_fixlayouts(m_fastrun_layouts);
     double cur_timeout = 0;

+    size_t data_size = 0;
+    for (auto ly : m_fastrun_layouts)
+        data_size += ly.span().dist_byte();
+
     auto workspace_limit =
             m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit);
     RealTimer timer;
@@ -925,6 +940,12 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
         ImplExecutionPolicy policy;
         policy.algo = algo.desc;

+        // skip the naive algo; no attribute identifies it, so match by
+        // name (strcmp) instead
+        if (algo.desc.name.compare("NAIVE") == 0) {
+            continue;
+        }
+
         //! check negative attribute : skip negative attribute
         auto palgo = m_dnn_opr->get_algorithm_from_desc(policy.algo);
         if (palgo->contain_attribute_any(target_attr.second)) {
@@ -938,10 +959,13 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
         //! check workspace limit
         construct_execution_policy(selected_strategy, policy);
-        mgb_assert(
-                policy.algo.valid(),
-                "construct execution policy must success when profiling");
-        if (get_workspace_size_bytes(policy) > workspace_limit) {
+        // construction may fail here, e.g. when constructing a matmul
+        // algorithm for a convolution opr
+        if (!policy.algo.valid())
+            continue;
+        size_t workspace_needed = get_workspace_size_bytes(policy);
+        if (data_size + workspace_needed >
+            m_desc.get_workspace_limit(m_cn, m_execution_policy.workspace_limit)) {
             continue;
         }
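
Together with the data_size accumulation a few hunks above, the admission test for a candidate algorithm now reads roughly as follows (a Python sketch with made-up sizes; dist_byte corresponds to a layout's byte span):

    # byte footprint of all fastrun layouts (data_size in the hunk above)
    layout_bytes = [64 * 3 * 224 * 224 * 4, 64 * 64 * 111 * 111 * 4]
    data_size = sum(layout_bytes)

    def admit(workspace_needed: int, workspace_limit: int) -> bool:
        # profile an algorithm only if tensor data plus its workspace
        # fits under the workspace limit
        return data_size + workspace_needed <= workspace_limit

    assert not admit(workspace_needed=8 << 30, workspace_limit=4 << 30)
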
@@ -957,7 +981,7 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
         })
         // megbrain uncatched exception
         MGB_CATCH(..., {
-            mgb_log_warn("caught exception during %s", msg.c_str());
+            mgb_log_debug("caught exception during %s", msg.c_str());
             continue;
         })
         if (!cur_rst.valid()) {
@@ -982,20 +1006,22 @@ void AlgoChooser<Opr>::AlgoChooserHelper::profile(
             "workspace limite requirement(%zu)",
             ::MegDNNOpr2Typename<Opr>::name, layouts_str.c_str(),
             Algorithm::attribute_str(target_attr.second).c_str(), workspace_limit);
-    mgb_assert(!prof_rst.empty(), "%s", msg.c_str());
+    // an empty profile result is allowed for the current opr;
+    // append previously profiled results, if any
+    if (rst.second.valid())
+        prof_rst.insert(
+                prof_rst.end(), rst.second.val().begin(), rst.second.val().end());

-    FixedTensorLayouts incache_layouts = m_incache_layouts;
-    typename Opr::Param origin_param = m_dnn_opr->param();
-    AlgoChooserProfileCache::Key cache_key{
-            incache_layouts.data(), incache_layouts.size(), &origin_param,
-            sizeof(origin_param)};
-    AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str());
-    cache.put(cache_key, prof_rst);
+    if (!prof_rst.empty()) {
+        FixedTensorLayouts incache_layouts = m_incache_layouts;
+        typename Opr::Param origin_param = m_dnn_opr->param();
+        AlgoChooserProfileCache::Key cache_key{
+                incache_layouts.data(), incache_layouts.size(), &origin_param,
+                sizeof(origin_param)};
+        AlgoChooserProfileCache cache(m_cn, profile_name(m_dnn_opr).c_str());
+        cache.put(cache_key, prof_rst);
+    }
     MIDOUT_E
 }
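
A sketch of the new bookkeeping at the end of profiling (plain dict/list stand-ins for the profile cache): previously cached entries are merged back into the result, and the cache is written only when there is something to store, so an opr with no surviving algorithm no longer trips an assertion.

    prof_rst = []                    # nothing survived profiling this run
    previous = [("algo_a", 0.12)]    # entries already in the cache, if any

    prof_rst.extend(previous)        # append previously profiled results
    cache = {}
    if prof_rst:                     # write back only non-empty results
        cache[("layouts", "param")] = prof_rst
    assert cache                     # previous entries were preserved
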
@@ -245,21 +245,34 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
         }
     });
     {
-        // first allocate a whole chunk to avoid memory fragmentation (here we
-        // rely on memory allocator to reuse memory)
-        auto align = cn.get_mem_addr_alignment();
-        size_t tot_size = align;
-        for (int i = 0; i < arity; ++i) {
-            tot_size += layouts[i].span().high_byte + align;
-        }
-        for (const auto& layout : preprocessed_layout) {
-            tot_size += layout.span().high_byte + align;
-        }
-        tot_size += param.workspace;
-        DeviceTensorStorage storage{cn};
-        storage.ensure_size(tot_size);
+        megdnn::Algorithm* algo =
+                megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo);
+        mgb_assert(algo);
+#if !MGB_BUILD_SLIM_SERVING
+#if MGB_CUDA || MGB_ROCM
+        // if tot_size exceeds workspace_limit, skip the current algo and
+        // return DBL_MAX; this check is needed because, when profiling an
+        // algo in a subprocess, the child process occupies some cuda memory
+        // for its own initialization, so checking against free memory here
+        // is more accurate than the previous up-front assertion
+        size_t workspace_limit =
+                std::max(cn.get_free_mem(), cn.get_max_block_size_available());
+        auto align = cn.get_mem_addr_alignment();
+        size_t tot_size = align;
+        for (int i = 0; i < arity; ++i) {
+            tot_size += layouts[i].span().high_byte + align;
+        }
+        for (const auto& layout : preprocessed_layout) {
+            tot_size += layout.span().high_byte + align;
+        }
+        tot_size += param.workspace;
+        if (tot_size > workspace_limit) {
+            mgb_log_debug(
+                    "current memory is not enough when profiling algo %s\n",
+                    algo->name());
+            return TResult::from_pod(Result{std::numeric_limits<double>::max()});
+        }
+#endif
+#endif
     // allocate input and output memory
     std::array<DeviceTensorND, arity_in> inp_val;
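
The DBL_MAX sentinel ties the two sides together: the child returns it when the projected allocation cannot fit, and get_profile_result_from_cache (earlier hunk) discards such entries instead of caching them. A compact Python sketch of the round trip (function names are stand-ins):

    import sys

    OOM_SENTINEL = sys.float_info.max  # std::numeric_limits<double>::max()

    def prof_impl(tot_size: int, workspace_limit: int) -> float:
        # child process: refuse to profile when the projected allocation
        # cannot fit, and report the sentinel instead of a real timing
        if tot_size > workspace_limit:
            return OOM_SENTINEL
        return 0.123  # stand-in for the measured kernel time

    def result_entry(time: float):
        # parent process: drop sentinel results instead of caching them
        return None if time == OOM_SENTINEL else {"time": time}

    assert result_entry(prof_impl(10 << 30, 8 << 30)) is None
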
@@ -334,20 +347,17 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
     });
     ev_end->record();

-    megdnn::Algorithm* algo =
-            megdnn_opr->get_algorithm_from_desc(megdnn_opr->execution_policy().algo);
-    mgb_assert(algo);
     double next_report_time = 0.5;
     while (!ev_end->finished()) {
         if (timer.get_secs() >= next_report_time) {
 #if MGB_ENABLE_GETENV
             mgb_log_debug(
-                    "profiling conv algo %s already took %.3f/%.3f secs"
+                    "profiling algo %s already took %.3f/%.3f secs"
                     " (limit can be set by MGB_CONV_PROFILING_TIMEOUT) ",
                     algo->name(), timer.get_secs(), param.actual_timeout);
 #else
             mgb_log_debug(
-                    "profiling conv algo %s already took %.3f/%.3f secs", algo->name(),
+                    "profiling algo %s already took %.3f/%.3f secs", algo->name(),
                     timer.get_secs(), param.actual_timeout);
 #endif
             next_report_time = timer.get_secs() + 1;
@@ -357,6 +367,19 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
         std::this_thread::sleep_for(1000us);
 #endif
     }
+    DeviceTensorStorage storage;
+    for (int i = 0; i < arity_in; ++i) {
+        inp_val[i].reset(storage, TensorLayout{});
+    }
+    for (int i = 0; i < arity_out; ++i) {
+        out_val[i].reset(storage, TensorLayout{});
+    }
+    for (size_t i = 0; i < preprocessed_layout.size(); i++) {
+        flt_val[i].reset(storage, TensorLayout{});
+    }
+    mdn_workspace = megdnn::Workspace{};
+    workspace.reset(storage, TensorLayout{});
+
     // release all free blocks owned by child process,
     // in order to avoid main process running out of memory
     cn.try_coalesce_all_free_memory();
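
The explicit resets matter because try_coalesce_all_free_memory can only return blocks that no live tensor still pins; dropping every reference first lets the child hand back its whole pool before the parent resumes. In Python terms (a loose analogy, not MegEngine API):

    import gc

    buffers = {
        "inp": bytearray(1 << 20),
        "out": bytearray(1 << 20),
        "workspace": bytearray(1 << 20),
    }
    buffers.clear()  # analogous to the reset(storage, TensorLayout{}) calls
    gc.collect()     # analogous to cn.try_coalesce_all_free_memory()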