@@ -13,12 +13,11 @@

 #include "megbrain/common.h"
 #include "megbrain/imperative/opr_utility.h"
-#include "megbrain/imperative/ops/backward_graph.h"
 #include "megbrain/imperative/ops/autogen.h"
+#include "megbrain/imperative/ops/backward_graph.h"
+#include "megbrain/imperative/ops/opr_attr.h"
 #include "megbrain/imperative/utils/to_string.h"
 
-#include "../op_trait.h"
-
 using namespace mgb;
 using namespace imperative;
 using namespace interpreter;
@@ -61,8 +60,6 @@ Handle ChannelImpl::put(const DeviceTensorND& data) {
 void ChannelImpl::del(Handle handle) {
     mgb_assert(m_valid_handle.count(handle), "invalid handle: %p", handle);
     auto* info = reinterpret_cast<TensorInfo*>(handle);
-    detach_users(info);
-    info->detach_producer();
     m_valid_handle.erase(handle);
     m_buffer.enqueue(Del{info});
 }
@@ -73,7 +70,6 @@ void ChannelImpl::swap_in(Handle handle) {
                 "invalid handle: %p", handle);
         auto* info = reinterpret_cast<TensorInfo*>(handle);
         m_buffer.enqueue(SwapIn{info});
-        info->evict_type = NONE;
     }
 }
 
@@ -83,7 +79,6 @@ void ChannelImpl::swap_out(Handle handle) {
                 "invalid handle: %p", handle);
         auto* info = reinterpret_cast<TensorInfo*>(handle);
         m_buffer.enqueue(SwapOut{info});
-        info->evict_type = SWAP;
     }
 }
 
@@ -92,11 +87,6 @@ void ChannelImpl::drop(Handle handle) {
         mgb_assert(m_valid_handle.find(handle) != m_valid_handle.end(),
                 "invalid handle: %p", handle);
         auto* info = reinterpret_cast<TensorInfo*>(handle);
-        if (!info->producer) {
-            mgb_log_warn("the input that produced tensor %p has been deleted, this drop operation will be ignored", info);
-            return;
-        }
-        info->evict_type = DROP;
         m_buffer.enqueue(Drop{info});
     }
 }
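Note: the producer check and its warning did not disappear; they move to the worker side in the new do_drop(ptr, user) helper (hunk at -399 below), which is invoked with user=true from the Drop command handler (hunk at -602). drop() itself now only validates the handle and enqueues.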
@@ -167,10 +157,6 @@ void ChannelImpl::dispatch_default_cpu(
         outputs->push_back(info);
     }
 
-    if (m_channel_state.options.enable_drop) {
-        TensorInfo::ComputePath::make(op, input_infos, output_infos);
-    }
-
     event_data.outputs = tinfo_to_tid(output_infos);
     if (m_channel_state.profiler->is_profiling()) {
         m_channel_state.profiler->record_host<HostOpFinishEvent>(event_data);
@@ -199,9 +185,6 @@ void ChannelImpl::dispatch_kernel(
         cmd.outputs.push_back(info);
         outputs->push_back(info);
     }
-    if (m_channel_state.options.enable_drop) {
-        TensorInfo::ComputePath::make(cmd.op, cmd.inputs, cmd.outputs);
-    }
     m_buffer.enqueue(std::move(cmd));
     if (!validated && m_channel_state.options.async_level == 1) {
         sync();
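Note: in both dispatch paths, ComputePath recording moves from dispatch time on the channel thread to execution time on the worker (hunk at -581 below). At execution time the actual blobs exist, so the worker can skip recording for in-place ops and for ops spanning compute nodes, which the channel thread could not detect.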
@@ -233,7 +216,6 @@ SmallVector<Handle> ChannelImpl::apply_op(
         mgb_assert(!info->invalid, "Invalid tensor, unable to apply_op!");
         input_infos.push_back(info);
         input_descs.push_back(info->desc);
-        regenerate(info);
     }
 }
 
@@ -269,7 +251,6 @@ HostTensorND ChannelImpl::get_value(Handle handle) {
     };
     if (!value_fetched()) {
         m_waitee = info;
-        regenerate(info);
         m_buffer.enqueue(GetValue{info});
         if (m_channel_state.profiler->is_profiling()) {
             m_channel_state.profiler->record_host<TensorWaitPropEvent>(info->id, TensorInfo::HostValue);
@@ -345,7 +326,6 @@ DeviceTensorND ChannelImpl::get_dev_tensor(Handle handle) {
     std::unique_lock<decltype(m_mutex)> lock(m_mutex);
     mgb_assert(!m_waitee);
     m_waitee = info;
-    regenerate(info);
     m_buffer.flush();
     if (m_channel_state.profiler->is_profiling()) {
         m_channel_state.profiler->record_host<TensorWaitPropEvent>(info->id, TensorInfo::DevValue);
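Note: the three eager regenerate(info) calls deleted above become lazy. The worker now regenerates an evicted tensor only when a command actually needs it: see the regeneration loop added at -524 and the GetValue handler at -581.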
@@ -379,11 +359,11 @@ void ChannelImpl::close() {
     sync();
 }
 
-int ChannelImpl::get_option(std::string name) {
+size_t ChannelImpl::get_option(std::string name) {
     return m_channel_state.options.get_option(name);
 }
 
-void ChannelImpl::set_option(std::string name, int value) {
+void ChannelImpl::set_option(std::string name, size_t value) {
     m_channel_state.options.set_option(name, value);
     m_buffer.enqueue(SetOption{name, value});
 }
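Widening option values from int to size_t matters here because memory budgets are byte counts that can exceed what a 32-bit int holds. A hypothetical usage sketch; the option keys are assumed to match the option fields referenced later in this diff (enable_auto_drop, memory_budget), and `channel` stands for any ChannelImpl instance:

    // assumed option names; see the m_worker_state.options usages below
    channel->set_option("enable_auto_drop", 1);
    channel->set_option("memory_budget", size_t(5) << 30);  // ~5 GiB, would overflow a 32-bit int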
@@ -399,11 +379,64 @@ TensorInfo* ChannelImpl::alloc() {
     return info;
 }
 
+void ChannelImpl::do_drop(TensorInfo* ptr, bool user=false) {
+    if (!ptr->producer) {
+        if (user) {
+            mgb_log_warn("the input that produced tensor %p has been deleted, this drop operation will be ignored", ptr);
+        }
+        return;
+    }
+
+    if (ptr->evict_type != EvictType::NONE) {
+        return;
+    }
+    ptr->evict_type = EvictType::DROP;
+    release_tensor(ptr);
+}
+
 void ChannelImpl::free(TensorInfo* ptr) {
+    if (m_worker_state.options.enable_auto_drop) {
+        // Evicting a tensor, rather than freeing it, can avoid pinning
+        // potentially exploding amounts of memory and allow us to save
+        // more memory.
+        ptr->allow_delete = true;
+        if (!ptr->ref_cnt) {
+            recursive_free(ptr);
+        } else {
+            do_drop(ptr);
+        }
+    } else {
+        real_free(ptr);
+    }
+}
+
+void ChannelImpl::recursive_free(TensorInfo* ptr) {
+    SmallVector<TensorInfo*> inps(0);
+    if (ptr->producer) {
+        for (auto i : ptr->producer->inputs) {
+            if (i && --i->ref_cnt == 0) {
+                inps.push_back(i);
+            }
+        }
+    }
+    real_free(ptr);
+    for (auto i : inps) {
+        if (i->allow_delete) {
+            recursive_free(i);
+        }
+    }
+}
+
+void ChannelImpl::real_free(TensorInfo* ptr) {
     MGB_LOCK_GUARD(m_mutex);
     if (m_channel_state.profiler->is_profiling()) {
         m_channel_state.profiler->record_host<TensorEraseEvent>(ptr->id);
     }
+    if (ptr->size_exceeds_thd(m_worker_state.options.tensor_lowerbound)) {
+        m_dtr.erase_candidate(ptr);
+    }
+    detach_users(ptr);
+    ptr->detach_producer();
     m_pool.free(ptr);
 }
 
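Note on the new deletion protocol: with auto-drop enabled, free() no longer destroys a TensorInfo outright. allow_delete marks it logically deleted, and ref_cnt counts how many recorded compute-path outputs still depend on it as a recompute input. If nothing references it, recursive_free() releases it and decrements the ref_cnt of its producer's inputs, cascading into any of those that were themselves only kept alive as recompute sources. Otherwise it degrades to do_drop(), keeping the metadata so dependents can still be rematerialized. real_free() is the old free() body plus removal from the DTR candidate set and the detach_users()/detach_producer() calls that were dropped from del().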
@@ -415,17 +448,24 @@ ChannelImpl::~ChannelImpl() {
     close();
 }
 
-void ChannelImpl::produce_tensor(TensorInfo* dest, TensorPtr ptr) {
-    MGB_LOCK_GUARD(m_mutex);
-    if (m_worker_state.profiler->is_profiling()) {
+void ChannelImpl::produce_tensor(TensorInfo* dest, TensorPtr ptr, bool notice=true) {
+    auto lock = notice ? std::unique_lock<std::mutex>(m_mutex)
+                       : std::unique_lock<std::mutex>();
+    m_dtr.update_used_time(dest);
+    if (notice && m_worker_state.profiler->is_profiling()) {
         m_worker_state.profiler->record_host<TensorProduceEvent>(dest->id, ptr->layout(), ptr->comp_node());
     }
     dest->value_fetched = ptr->value_fetched();
     // update tensor desc for static infer
     dest->desc.layout = ptr->layout();
     dest->desc.comp_node = ptr->comp_node();
+    dest->memory = ptr->blob()->size();
     dest->ptr = std::move(ptr);
-    if (m_waitee == dest) {
+    dest->evict_type = EvictType::NONE;
+    if (notice && dest->size_exceeds_thd(m_worker_state.options.tensor_lowerbound)) {
+        m_dtr.insert_candidate(dest);
+    }
+    if (notice && m_waitee == dest) {
         m_cv.notify_all();
     }
 }
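Note: the notice parameter distinguishes user-visible production from internal rematerialization. recompute() below calls produce_tensor(o, ..., false), which skips taking m_mutex, profiling, candidate registration, and waiter wake-up; presumably this avoids re-entrant locking on the worker and keeps freshly rematerialized tensors from immediately re-entering the eviction candidate set.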
@@ -436,37 +476,86 @@ void ChannelImpl::release_tensor(TensorInfo* dest) {
 }
 
 void ChannelImpl::regenerate(TensorInfo* dest) {
-    if (dest->evict_type == DROP) {
+    if (dest->evict_type == EvictType::DROP) {
         recompute(dest->producer);
-    } else if (dest->evict_type == SWAP) {
-        swap_in(dest);
+    } else if (dest->evict_type == EvictType::SWAP) {
+        produce_tensor(dest, Tensor::make(dest->h_value));
     }
-    mgb_assert(dest->evict_type == NONE);
 }
 
 void ChannelImpl::recompute(TensorInfo::ComputePath* path) {
-    SmallVector<TensorInfo*> workspaces(path->outputs.size(), nullptr);
-    for (auto&& input: path->inputs) {
-        regenerate(input);
+    SmallVector<TensorPtr> inputs;
+    inputs.reserve(path->inputs.size());
+    m_dtr.pin(path->inputs);
+    for (auto i : path->inputs) {
+        if (!i->ptr) {
+            regenerate(i);
+        }
+        inputs.push_back(i->ptr);
+        m_dtr.update_used_time(i);
     }
-    for (auto&& output: path->outputs) {
-        if(output == nullptr) {
-            continue;
+    if (m_worker_state.options.enable_auto_drop && m_worker_state.options.memory_budget > 0) {
+        auto_evict();
+    }
+    auto outputs = OpDef::apply_on_physical_tensor(*path->op, inputs);
+    m_dtr.estimate_timestamp += path->compute_time / 1e8;
+    m_dtr.unpin(path->inputs);
+    for (size_t i = 0;i < outputs.size();i ++) {
+        auto&& o = path->outputs[i];
+        if (o) {
+            o->recompute_times ++;
+            if (!o->ptr) {
+                produce_tensor(o, std::move(outputs[i]), false);
+                if (m_worker_state.options.enable_auto_drop) {
+                    m_dtr.update_dsu_after_recompute(o);
+                }
+            }
         }
-        output->evict_type = NONE;
     }
-    m_buffer.enqueue(ApplyOp{path->op, path->inputs, path->outputs});
+}
+
+void ChannelImpl::auto_evict() {
+    if (!m_dtr.comp_node.valid()) {
+        return;
+    }
+    size_t current_memory = m_dtr.comp_node.get_used_memory();
+    while (current_memory > m_worker_state.options.memory_budget) {
+        auto best = m_dtr.find_best_tensor();
+        if (!best) {
+            if (!m_dtr.warn_printed) {
+                m_dtr.warn_printed = true;
+                mgb_log_warn("No tensors on %s can be evicted automatically "
+                        "when memory usage is %.0lfMB. Maybe memory "
+                        "budget is too small.",
+                        m_dtr.comp_node.to_string().c_str(),
+                        current_memory / 1024.0 / 1024.0);
+            }
+            break;
+        }
+        if (best->ptr.unique() && best->ptr->blob().unique()) {
+            current_memory -= best->memory;
+        }
+        do_drop(best);
+        if (best->evict_type == EvictType::DROP) {
+            m_dtr.update_dsu_after_evict(best);
+        }
+    }
 }
 
 void ChannelImpl::detach_users(TensorInfo* dest) {
     SmallVector<TensorInfo::ComputePath*> users = dest->users;
     for (auto* user: users) {
-        for (auto* output: user->outputs) {
+        SmallVector<TensorInfo*> outputs = user->outputs;
+        SmallVector<TensorInfo*> inputs = user->inputs;
+        for (auto* output: outputs) {
             if (output == nullptr) {
                 continue;
             }
             regenerate(output);
             output->detach_producer();
+            for (auto* input: inputs) {
+                input->ref_cnt --;
+            }
         }
     }
     mgb_assert(dest->users.size() == 0);
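Note on the rewritten regenerate()/recompute(): recomputation is now performed synchronously on the worker instead of being re-enqueued as an ApplyOp, a swapped-out tensor is restored inline from h_value instead of going back through swap_in(), and the trailing mgb_assert(evict_type == NONE) is gone because produce_tensor() now resets evict_type itself. In auto_evict(), current_memory is decremented only when the evicted TensorInfo holds the sole reference to its blob (ptr.unique() && blob().unique()), since dropping a shared blob frees no device memory; when find_best_tensor() returns nothing evictable, the loop warns once and gives up rather than spinning.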
@@ -524,6 +613,15 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) {
         uint64_t apply_id = ++m_last_id;
         SmallVector<TensorPtr> tensor_inputs;
         SmallVector<CompNode> devices;
+        if (m_worker_state.options.enable_auto_drop) {
+            m_dtr.pin(cmd.inputs);
+        }
+        for (auto i : cmd.inputs) {
+            if (!i->ptr && i->evict_type != EvictType::NONE) {
+                regenerate(i);
+            }
+            m_dtr.update_used_time(i);
+        }
         tensor_inputs.reserve(cmd.inputs.size());
         // refcnt == 1, owners: [TensorInfo::ptr]
         for (auto i : cmd.inputs) {
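Note: inputs are pinned before the regeneration loop so that the auto_evict() call added below cannot evict a tensor this very op is about to read; they are unpinned in the bookkeeping block at the end of the ApplyOp branch (hunk at -581).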
@@ -569,6 +667,9 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) {
                 m_worker_state.profiler->record_device<DeviceOpExecuteEvent>(device, event_data);
             }
         }
+        if (m_worker_state.options.enable_auto_drop && m_worker_state.options.memory_budget > 0) {
+            auto_evict();
+        }
         // Apply op
         // Here std::move is REQUIRED for removing duplicated references.
         auto tensor_outputs = OpDef::apply_on_physical_tensor(
@@ -581,16 +682,78 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) {
             }
         }
         // End profiling operator
+        double estimate_compute_time = 0;
+        if (m_worker_state.options.enable_auto_drop) {
+            for (auto i : cmd.inputs) {
+                estimate_compute_time += i->memory;
+            }
+            for (auto i : tensor_outputs) {
+                estimate_compute_time += i->blob()->size();
+            }
+            m_dtr.estimate_timestamp += estimate_compute_time / 1e8;
+            for (auto i : cmd.outputs) {
+                i->compute_time = estimate_compute_time;
+                m_dtr.update_used_time(i);
+            }
+            if (cmd.outputs[0]->producer) {
+                cmd.outputs[0]->producer->compute_time = estimate_compute_time;
+            }
+            m_dtr.unpin(cmd.inputs);
+        }
         mgb_assert(tensor_outputs.size() == cmd.outputs.size());
         for (size_t i = 0; i < tensor_outputs.size(); ++i) {
             if (cmd.outputs[i] == nullptr) {
                 continue;
             }
             produce_tensor(cmd.outputs[i], std::move(tensor_outputs[i]));
+            if (m_worker_state.options.enable_auto_drop) {
+                cmd.outputs[i]->dsu_ptr = std::make_shared<DsuNode>(estimate_compute_time);
+            }
         }
+        if (m_worker_state.options.enable_drop == 1
+            && m_worker_state.options.record_computing_path == 1){
+            bool is_inplace = false;
+            bool cross_cn = false;
+            for (auto input : cmd.inputs) {
+                for (auto output : cmd.outputs) {
+                    if (input->ptr->blob()->storage() == output->ptr->blob()->storage()) {
+                        is_inplace = true;
+                        break;
+                    }
+                }
+            }
+            for (auto input : cmd.inputs) {
+                if (input->ptr->comp_node() != m_dtr.comp_node) {
+                    cross_cn = true;
+                    break;
+                }
+            }
+            for (auto output : cmd.outputs) {
+                if (output->ptr->comp_node() != m_dtr.comp_node) {
+                    cross_cn = true;
+                    break;
+                }
+            }
+            if (!is_inplace && !cross_cn) {
+                TensorInfo::ComputePath::make(cmd.op, cmd.inputs, cmd.outputs);
+                size_t detach_cnt = 0;
+                for (auto output : cmd.outputs) {
+                    if (!output->size_exceeds_thd(m_worker_state.options.tensor_lowerbound)) {
+                        output->detach_producer();
+                        detach_cnt ++;
+                    }
+                }
+                for (auto input : cmd.inputs) {
+                    input->ref_cnt -= detach_cnt;
+                }
+            }
+        }
     } else if constexpr (std::is_same_v<T, Del>) {
         free(cmd.dest);
     } else if constexpr (std::is_same_v<T, GetValue>) {
+        if (!cmd.dest->ptr && cmd.dest->evict_type != EvictType::NONE) {
+            regenerate(cmd.dest);
+        }
         mgb_assert(cmd.dest->ptr, "Invalid tensor ptr!");
         cmd.dest->ptr->fetch_value();
         MGB_LOCK_GUARD(m_mutex);
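Note: estimate_compute_time is not a measured duration; it is a byte-count proxy (input bytes plus output bytes), and estimate_timestamp advances by that value divided by 1e8 to form a pseudo-time. Only relative magnitudes matter to the eviction heuristic. The recording guard means a ComputePath is kept only for ops that are neither in-place (an output aliasing an input's storage would make "recompute" meaningless) nor spanning multiple compute nodes, and outputs below tensor_lowerbound detach immediately, presumably because evicting tiny tensors can never pay for their recomputation.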
@@ -602,9 +765,12 @@ void ChannelImpl::process_one_task(IdentifiedCommand& icmd) {
         produce_tensor(cmd.dest, Tensor::make(cmd.dest->h_value));
     } else if constexpr (std::is_same_v<T, SwapOut>) {
         cmd.dest->h_value = cmd.dest->ptr->get_value();
-        release_tensor(cmd.dest);
+        if (cmd.dest->evict_type == EvictType::NONE) {
+            release_tensor(cmd.dest);
+            cmd.dest->evict_type = EvictType::SWAP;
+        }
     } else if constexpr (std::is_same_v<T, Drop>) {
-        release_tensor(cmd.dest);
+        do_drop(cmd.dest, true);
     } else if constexpr (std::is_same_v<T, SetOption>) {
         m_worker_state.options.set_option(cmd.key, cmd.value);
     } else if constexpr (std::is_same_v<T, StartProfile>) {
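Note: the SwapOut handler is now idempotent; the evict_type guard prevents releasing a tensor twice if it is swapped out again before being swapped back in. Drop funnels through do_drop(cmd.dest, true), so the "producer deleted" warning removed from ChannelImpl::drop() is now emitted here on the worker, and only for user-initiated drops (auto_evict() calls do_drop() with user=false and stays silent).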
@@ -833,3 +999,111 @@ void ChannelImpl::assert_in_channel() {
 void ChannelImpl::assert_in_worker() {
     mgb_assert(m_worker_state.tid == std::this_thread::get_id());
 }
+
+void ChannelImpl::DynamicSublinear::pin(const SmallVector<TensorInfo*>& vec) {
+    for (auto i : vec) {
+        i->pin();
+    }
+}
+
+void ChannelImpl::DynamicSublinear::unpin(const SmallVector<TensorInfo*>& vec) {
+    for (auto i : vec) {
+        i->unpin();
+    }
+}
+
+void ChannelImpl::DynamicSublinear::update_dsu_after_recompute(TensorInfo* ptr) {
+    auto&& dsu_fa = find_father(ptr->dsu_ptr);
+    dsu_fa->t -= ptr->compute_time;
+    ptr->dsu_ptr->parent.reset();
+    ptr->dsu_ptr->t = ptr->compute_time;
+}
+
+void ChannelImpl::DynamicSublinear::update_dsu_after_evict(TensorInfo* ptr) {
+    for (auto i : ptr->producer->inputs) {
+        if (i->evict_type == EvictType::DROP) {
+            merge(i->dsu_ptr, ptr->dsu_ptr);
+        }
+    }
+    for (auto i : ptr->producer->outputs) {
+        if (i && i->evict_type == EvictType::DROP) {
+            merge(ptr->dsu_ptr, i->dsu_ptr);
+        }
+    }
+}
+
+double ChannelImpl::DynamicSublinear::estimate_neighbor_cost(TensorInfo* ptr) {
+    double cost = 0;
+    for (auto i : ptr->producer->inputs) {
+        if (i->evict_type == EvictType::DROP) {
+            double t = find_father(i->dsu_ptr)->t;
+            if (t < i->compute_time) {
+                t = i->compute_time;
+            }
+            cost += t;
+        }
+    }
+    for (auto i : ptr->producer->outputs) {
+        if (i && i->evict_type == EvictType::DROP) {
+            double t = find_father(i->dsu_ptr)->t;
+            if (t < i->compute_time) {
+                t = i->compute_time;
+            }
+            cost += t;
+        }
+    }
+    return cost;
+}
+
+TensorInfo* ChannelImpl::DynamicSublinear::find_best_tensor() {
+    double min_msps = -1;
+    TensorInfo* best = nullptr;
+    for (auto i : candidates) {
+        if (i->producer && i->ptr && !i->pinned && i->evict_type == EvictType::NONE) {
+            double neighbor_cost = estimate_neighbor_cost(i);
+            size_t begin_ptr = reinterpret_cast<size_t>(i->ptr->blob()->storage().get());
+            auto side_info = i->ptr->comp_node().get_free_left_and_right(begin_ptr, begin_ptr + i->ptr->blob()->size());
+            double free_mem = side_info.first + side_info.second;
+            double msps = i->eval_func(neighbor_cost, free_mem, estimate_timestamp, 1.0, 1.0, 1.0, 1.0001);
+            if (min_msps < 0 || msps < min_msps) {
+                min_msps = msps;
+                best = i;
+            }
+        }
+    }
+    return best;
+}
+
+void ChannelImpl::DynamicSublinear::merge(std::shared_ptr<DsuNode> &x, std::shared_ptr<DsuNode> &y) {
+    auto&& f_x = find_father(x);
+    auto&& f_y = find_father(y);
+    if (f_x.get() == f_y.get()) {
+        return;
+    }
+    f_y->t += f_x->t;
+    f_x->parent = f_y;
+}
+
+std::shared_ptr<DsuNode> ChannelImpl::DynamicSublinear::find_father(std::shared_ptr<DsuNode>& x) {
+    if (x->is_root()) {
+        return x;
+    } else {
+        auto&& fa = find_father(x->parent);
+        return x->parent = fa;
+    }
+}
+
+void ChannelImpl::DynamicSublinear::insert_candidate(TensorInfo* ptr) {
+    candidates.insert(ptr);
+    if (!comp_node.valid()) {
+        comp_node = ptr->ptr->comp_node();
+    }
+}
+
+void ChannelImpl::DynamicSublinear::erase_candidate(TensorInfo* ptr) {
+    candidates.erase(ptr);
+}
+
+void ChannelImpl::DynamicSublinear::update_used_time(TensorInfo* ptr) {
+    ptr->last_used_time = estimate_timestamp;
+}
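
For reference, two pieces used above are defined outside this diff. First, the DSU node: judging from how find_father(), merge(), and update_dsu_after_recompute() use it, it is presumably a pointer-based union-find node that accumulates the recomputation time of its set, along these lines:

    // Sketch only; the actual definition lives in the header, not in this diff.
    struct DsuNode {
        DsuNode(double t): t(t) {}
        std::shared_ptr<DsuNode> parent;  // empty while this node is a root
        double t;                         // summed compute_time of the merged set
        bool is_root() { return !bool(parent); }
    };

merge() folds the absorbed root's time into the surviving root (f_y->t += f_x->t), so a root's t approximates the total cost of rematerializing every evicted tensor in that connected neighborhood, which is exactly what estimate_neighbor_cost() reads back through find_father(...)->t.

Second, TensorInfo::eval_func(), which scores eviction candidates in find_best_tensor(). Its body is not shown here; given its arguments (neighborhood recompute cost, adjacent free memory, current pseudo-timestamp, three tuning exponents, and a recompute-count penalty base of 1.0001) it presumably implements a DTR-style score where smaller is better: cheap to recompute, much memory reclaimed, long unused. A hypothetical sketch of the shape, not the actual definition:

    // Hypothetical sketch of a DTR-style score; the member names used here
    // (memory, last_used_time, recompute_times) follow fields referenced in
    // this diff, and the epsilons merely guard against division by zero.
    double eval_func(double cost, double free_mem, double cur_time,
                     double param_cost, double param_mem, double param_stay,
                     double param_recompute_times) {
        return pow(cost + 1e-3, param_cost) *
               pow(param_recompute_times, double(recompute_times)) /
               (pow((memory + free_mem) / (1024.0 * 1024.0), param_mem) *
                pow(cur_time - last_used_time + 1e-3, param_stay));
    }

With the unit parameters passed above this reduces to roughly cost / (reclaimable_memory * staleness), the eviction heuristic of the DTR paper (Kirisame et al., Dynamic Tensor Rematerialization) that this DynamicSublinear state implements.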