feat(bazel/windows/xp/sp2/inference): implement inference on windows xp

(os version >= sp2) build with bazel * bazel build support (define __DEPLOY_ON_XP_SP2__ when deploying on xp sp2): (dbg)./bazel build //brain/megbrain:load_and_run --cpu='x86_windows_xp' --compiler='clang_cl' -c dbg --copt "-D__DEPLOY_ON_XP_SP2__=1" (opt)./bazel build //brain/megbrain:load_and_run --cpu='x86_windows_xp' --compiler='clang_cl' -c opt --copt "-D__DEPLOY_ON_XP_SP2__=1" * internal behavior: MGB_HAVE_THREAD=0 will be defined when __DEPLOY_ON_XP_SP2__ is enabled * refer to https://docs.microsoft.com/en-us/cpp/build/configuring-programs-for-windows-xp?view=msvc-160 xp sp2 (x86) does not fully support the vc runtime, because KERNEL32.dll does not implement some base apis needed by c++ std features, for example, std::mutex/std::thread/std::condition_variable; as a workaround, we disable some MegEngine features on the xp sp2 env, for example, multi-thread etc! * about DNN_MUTEX/MGB_MUTEX: if your code will be built into inference code (even CPU backends), please replace std::mutex with DNN_MUTEX/MGB_MUTEX * about multi-thread: if your code needs multi-thread support, please enable it only when MGB_HAVE_THREAD=1 * about test build env status: 1: Visual Studio 2019 (MSVC version <= 14.26.28801) ---- pass 2: Visual Studio 2019 (MSVC version > 14.26.28801) ---- failed, because this 'new' version makes the VCR depend on win7 KERNEL32.DLL; this may be fixed in a later Visual Studio 2019 version, but we did not test that at this MR merge point 3: Visual Studio 2017 ---------- pass 4: Visual Studio 2014 ---------- pass GitOrigin-RevId: 65ac48b95e
3 years ago · c68e669530
--- a/dnn/src/common/basic_types.cpp
+++ b/dnn/src/common/basic_types.cpp
@@ -60,10 +60,10 @@ T deserialize_pod(const std::string& data, size_t& offset) {
 ErrorHandler* ErrorHandler::sm_inst;
 ErrorHandler* ErrorHandler::inst() {
    static std::mutex mtx;
    static DNN_MUTEX mtx;
    static DefaultErrorHandler default_handler;
    if (megdnn_unlikely(!sm_inst)) {
        std::lock_guard<std::mutex> lg{mtx};
        MEGDNN_LOCK_GUARD(mtx);
        if (!sm_inst) {
            sm_inst = &default_handler;
        }
--- a/dnn/src/common/cv/interp_helper.cpp
+++ b/dnn/src/common/cv/interp_helper.cpp
@@ -145,7 +145,7 @@ init_inter_tab_1d(InterpolationMode imode, float* tab, int tabsz) {
 #if MEGDNN_X86
 DEF_FUN(const int16_t*) get_linear_ic4_table() {
    auto table_holder = &sm_tab_linear;
    std::lock_guard<std::mutex> lg{table_holder->mtx};
    MEGDNN_LOCK_GUARD(table_holder->mtx);
    float* tab = nullptr;
    short* itab = nullptr;
    MEGDNN_MARK_USED_VAR(tab);
@@ -175,7 +175,7 @@ DEF_FUN(const void*) get_table(InterpolationMode imode, bool fixpt) {
        default:
            megdnn_throw(("unsupported interpolation mode"));
    }
    std::lock_guard<std::mutex> lg{table_holder->mtx};
    MEGDNN_LOCK_GUARD(table_holder->mtx);
    float* tab = nullptr;
    short* itab = nullptr;
--- a/dnn/src/common/cv/interp_helper.h
+++ b/dnn/src/common/cv/interp_helper.h
@@ -134,7 +134,7 @@ private:
    };
    struct TableHolderBase {
        std::mutex mtx;
        DNN_MUTEX mtx;
        //! get table pointer; return whether already init
        virtual bool get(float**, int16_t**) = 0;
--- a/dnn/src/common/elemwise/opr_impl.cpp
+++ b/dnn/src/common/elemwise/opr_impl.cpp
@@ -39,10 +39,10 @@ using Mode = param::Elemwise::Mode;
 using ModeTrait = ElemwiseForward::ModeTrait;
 const ModeTrait& ModeTrait::from_mode(Mode mode) {
    static std::mutex mtx;
    static DNN_MUTEX mtx;
    static std::vector<ModeTrait> traits;
    std::lock_guard<std::mutex> _lock(mtx);
    MEGDNN_LOCK_GUARD(mtx);
    if (traits.empty()) {
        auto get = [&](Mode m) -> ModeTrait& {
--- a/dnn/src/common/elemwise_multi_type/opr_impl.cpp
+++ b/dnn/src/common/elemwise_multi_type/opr_impl.cpp
@@ -28,10 +28,10 @@ void check_dtype(const ModeTrait& trait, size_t i, const TensorLayout& src) {
 }  // anonymous namespace
 const ModeTrait& ModeTrait::from_mode(Mode mode) {
    static std::mutex mtx;
    static DNN_MUTEX mtx;
    static std::vector<ModeTrait> traits;
    std::lock_guard<std::mutex> _lock(mtx);
    MEGDNN_LOCK_GUARD(mtx);
    auto make_check_dtype_func = [](DType expected) {
        auto func = [expected](DType dtype) {
--- a/dnn/src/common/handle_impl.h
+++ b/dnn/src/common/handle_impl.h
@@ -70,7 +70,7 @@ protected:
        MIDOUT_BEGIN(dnn_src_common_handle_impl, Opr, idx) {
            static_assert(idx < NR_HELPER_OPRS, "invalid idx");
            if (!self->m_helper_oprs[idx]) {
                std::lock_guard<std::mutex> lg{self->m_helper_oprs_mtx};
                MEGDNN_LOCK_GUARD(self->m_helper_oprs_mtx);
                if (!self->m_helper_oprs[idx]) {
                    self->m_helper_oprs[idx] =
                            self->template create_operator<Opr>();
@@ -88,7 +88,7 @@ protected:
 private:
    std::array<std::unique_ptr<OperatorBase>, NR_HELPER_OPRS> m_helper_oprs;
    std::mutex m_helper_oprs_mtx;
    DNN_MUTEX m_helper_oprs_mtx;
 };
 }  // namespace megdnn
--- a/dnn/src/common/opr_delegate.h
+++ b/dnn/src/common/opr_delegate.h
@@ -38,7 +38,7 @@ const std::shared_ptr<Handle>& inplace_cpu_handle(int debug_level = 0);
 */
 template <int nr_opr = 1>
 class CpuOprDelegationStorage {
    std::mutex m_mtx;
    DNN_MUTEX m_mtx;
    std::shared_ptr<Handle> m_handle;
    std::unique_ptr<OperatorBase> m_oprs[nr_opr];
--- a/dnn/src/common/tensor_format.cpp
+++ b/dnn/src/common/tensor_format.cpp
@@ -604,7 +604,7 @@ TensorLayout LowbitsAlignedTensorFormatBase::collapse_contiguous_spec(
 TensorFormat Image2DPack4TensorFormat::make_raw(
        size_t align_axis, size_t align_size_in_elements,
        Handle::HandleVendorType vendor_type) {
    static std::mutex mtx;
    static DNN_MUTEX mtx;
    static std::unordered_map<uint64_t,
                              std::unique_ptr<Image2DPack4TensorFormat>>
            cache;
@@ -641,7 +641,7 @@ TensorFormat Image2DPack4TensorFormat::change_axis(size_t axis) const {
 /* ===================== LowbitsAlignedToBytesTensorFormat
 * ===================== */
 TensorFormat LowbitsAlignedToBytesTensorFormat::make(size_t size_nbits) {
    static std::mutex mtx;
    static DNN_MUTEX mtx;
    static std::unordered_map<
            uint64_t, std::unique_ptr<LowbitsAlignedToBytesTensorFormat>>
            cache;
--- a/dnn/src/common/utils.h
+++ b/dnn/src/common/utils.h
@@ -118,8 +118,17 @@
 #define megdnn_layout_msg(layout) \
    std::string(#layout "=" + (layout).to_string())
 #define MEGDNN_LOCK_GUARD(var) \
    std::lock_guard<std::remove_cv_t<decltype(var)>> _lock_guard_##var { var }
 //! Mutex type and scoped-lock macro used by megdnn code, selected by
 //! __DEPLOY_ON_XP_SP2__.
 #if __DEPLOY_ON_XP_SP2__
 //! XP SP2 build: DNN_MUTEX is a plain size_t and MEGDNN_LOCK_GUARD(var) only
 //! expands to MEGDNN_MARK_USED_VAR(var); no std::lock_guard is created.
 #define DNN_MUTEX size_t
 #define MEGDNN_LOCK_GUARD(var) MEGDNN_MARK_USED_VAR(var)
 #else
 //! Regular build: DNN_MUTEX is std::mutex.
 #define DNN_MUTEX std::mutex
 //! Two-level token pasting so that __LINE__ is expanded before it is pasted.
 #define DNN_TOKENPASTE(x, y) x##y
 #define DNN_TOKENPASTE2(x, y) DNN_TOKENPASTE(x, y)
 //! Declarator for the guard object: its name is __lock_guard_ followed by
 //! the current line number, and it is constructed from \p mtx.
 #define DNN_LOCK_GUARD_CTOR(mtx) DNN_TOKENPASTE2(__lock_guard_, __LINE__)(mtx)
 //! Declares a std::lock_guard over \p mtx that lives until the end of the
 //! enclosing scope.
 #define MEGDNN_LOCK_GUARD(mtx) \
    std::lock_guard<decltype(mtx)> DNN_LOCK_GUARD_CTOR(mtx)
 #endif
 namespace megdnn {
@@ -487,7 +496,7 @@ struct _SafeMultipliesImplUnsigned : public std::binary_function<T, T, T> {
                "implicit conversion disallowed in SafeMultiplies");
        megdnn_trap();
    }
 };
 };  // namespace megdnn
 template <>
 struct SafeMultiplies<size_t> : public _SafeMultipliesImplUnsigned<size_t> {};
--- a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.h
+++ b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.h
@@ -81,7 +81,7 @@ public:
    }
 private:
    std::mutex m_mtx;
    DNN_MUTEX m_mtx;
    std::unordered_map<StrategyHashKey, std::unique_ptr<T>, StrategyHasher,
                       StrategyHashKeyEqual>
            m_map_strategies;
@@ -99,4 +99,4 @@ MatrixMulImpl::KernSizeParam get_matmul_kern_param(
 }  // namespace fallback
 }  // namespace megdnn
 // vim: syntax=cpp.doxygen
 // vim: syntax=cpp.doxygen
--- a/dnn/src/fallback/conv_bias/im2col/factory.h
+++ b/dnn/src/fallback/conv_bias/im2col/factory.h
@@ -110,7 +110,7 @@ struct StrategyHashParamEqual {
 };
 class StrategyDelegationStorage {
    std::mutex m_mtx;
    DNN_MUTEX m_mtx;
    std::unordered_map<StrategyHashParam, std::unique_ptr<StrategyBase>,
                       StrategyHashParamHash, StrategyHashParamEqual>
            map_strategys;
--- a/dnn/src/naive/sleep/opr_impl.cpp
+++ b/dnn/src/naive/sleep/opr_impl.cpp
@@ -11,6 +11,10 @@
 #include "./opr_impl.h"
 #if __DEPLOY_ON_XP_SP2__
 #define MEGDNN_NO_THREAD 1
 #endif
 #include "src/naive/handle.h"
 #if !MEGDNN_NO_THREAD
 #include <thread>
@@ -20,10 +24,10 @@ namespace megdnn {
 namespace naive {
 void SleepForwardImpl::exec() {
    double seconds = m_param.time;
 #if MEGDNN_NO_THREAD
    megdnn_trap();
 #else
    double seconds = m_param.time;
    MEGDNN_DISPATCH_CPU_KERN_OPR(
            std::this_thread::sleep_for(std::chrono::microseconds(
                    static_cast<uint64_t>(seconds * 1e6))););
--- a/sdk/load-and-run/src/infile_persistent_cache.h
+++ b/sdk/load-and-run/src/infile_persistent_cache.h
@@ -52,7 +52,7 @@ class InFilePersistentCache final : public PersistentCache {
    std::unordered_map<std::string, std::unordered_map<BlobStorage, BlobStorage,
                                                       BlobStorage::Hash>>
            m_cache;
    std::mutex m_mtx;
    MGB_MUTEX m_mtx;
    template <typename Input>
    void read_cache(Input& inp);
--- a/src/core/impl/comp_node/comp_node.cpp
+++ b/src/core/impl/comp_node/comp_node.cpp
@@ -32,7 +32,7 @@ namespace {
    std::atomic_flag
        g_default_cpu_initialized,
        g_exit_handler_registered[CompNode::NR_DEVICE_TYPE];
    std::mutex g_device_map_mtx;
    MGB_MUTEX g_device_map_mtx;
    ThinHashMap<CompNode::DeviceType, ThinHashMap<int, int>> g_device_map;
    CompNode::DeviceType g_unspec_locator_type;
--- a/src/core/impl/comp_node/cpu/comp_node.cpp
+++ b/src/core/impl/comp_node/cpu/comp_node.cpp
@@ -60,7 +60,11 @@ class CpuCompNode::WorkerQueue final
            sys::set_cpu_affinity({m_locator.device});
 #endif
        }
 #if __DEPLOY_ON_XP_SP2__
        __builtin_trap();
 #else
        sys::set_thread_name(m_locator.to_string());
 #endif
    }
    void on_sync_all_task_finish() override {
@@ -830,7 +834,9 @@ struct CpuCompNode::Pool {
        void operator()(CompNodeRecorderImpl* p) { p->~CompNodeRecorderImpl(); }
    };
 #if !__DEPLOY_ON_XP_SP2__
    std::recursive_mutex mtx;
 #endif
    // use global memory pool to ensure object memory stays accessible even
    // after global finalization
    std::aligned_storage_t<sizeof(CompNodeRecorderImpl),
@@ -862,7 +868,9 @@ void CpuCompNode::foreach (thin_function<void(CompNode)> callback) {
    for (size_t i = 0;; ++i) {
        CompNode cur;
        {
 #if !__DEPLOY_ON_XP_SP2__
            MGB_LOCK_GUARD(sm_pool->mtx);
 #endif
            if (i >= sm_pool->nr_used_impl_storage)
                return;
            cur = make_comp_node_from_impl(
@@ -909,7 +917,9 @@ CpuCompNode::Impl* CpuCompNode::load_cpu(Locator locator,
                       locator.device == Locator::DEVICE_MULTITHREAD_DEFAULT,
               "failed to load cpu for device:%d stream:%d", locator.device,
               locator.stream);
 #if !__DEPLOY_ON_XP_SP2__
    MGB_LOCK_GUARD(sm_pool->mtx);
 #endif
    // encode both device ID and type into a int
    mgb_assert(locator_logical.device >= -1 ||
@@ -967,7 +977,9 @@ void CpuCompNode::sync_all() {
    if (!sm_pool)
        return;
 #if !__DEPLOY_ON_XP_SP2__
    MGB_LOCK_GUARD(sm_pool->mtx);
 #endif
    for (auto&& i : sm_pool->locator2impl)
        i.second->sync();
    for (auto&& i : sm_pool->locator2impl_multi_thread)
@@ -1049,7 +1061,9 @@ void CpuCompNode::CpuDispatchableBase::EventImpl::do_device_wait_by(
    auto waiter = [this, version]() {
        while (m_record_nr_finish.load(std::memory_order_acquire) < version) {
 #if !__DEPLOY_ON_XP_SP2__
            std::unique_lock<std::mutex> lk{m_dev_wait_mtx};
 #endif
            if (m_record_nr_finish.load(std::memory_order_acquire) >= version) {
                break;
            }
@@ -1078,10 +1092,12 @@ void CpuCompNode::CpuDispatchableBase::EventImpl::on_finish() {
    }
    m_record_nr_finish.fetch_add(1, std::memory_order_release);
 #if !__DEPLOY_ON_XP_SP2__
    if (m_dev_wait_nr_waiter.load(std::memory_order_acquire)) {
        MGB_LOCK_GUARD(m_dev_wait_mtx);
        m_dev_wait_cv.notify_all();
    }
 #endif
 }
 bool CpuCompNode::CpuDispatchableBase::EventImpl::do_finished() {
@@ -1100,11 +1116,15 @@ void CpuCompNode::CpuDispatchableBase::EventImpl::host_wait_cv() {
    m_dev_wait_nr_waiter.fetch_add(1, std::memory_order_release);
    for (;;) {
 #if !__DEPLOY_ON_XP_SP2__
        std::unique_lock<std::mutex> lock{m_dev_wait_mtx};
 #endif
        if (finished()) {
            break;
        }
 #if !__DEPLOY_ON_XP_SP2__
        m_dev_wait_cv.wait(lock);
 #endif
    }
    m_dev_wait_nr_waiter.fetch_sub(1, std::memory_order_release);
 }
--- a/src/core/impl/comp_node/impl_helper.cpp
+++ b/src/core/impl/comp_node/impl_helper.cpp
@@ -45,9 +45,17 @@ void CompNodeImplHelper::EventImplHelper::host_wait() {
        return;
    }
    if (sm_cpu_sync_level >= 1) {
 #if __DEPLOY_ON_XP_SP2__
 #if MGB_HAVE_THREAD
        __builtin_trap();
 #else
        return;
 #endif
 #else
        while (!finished()) {
            std::this_thread::yield();
        }
 #endif
        return;
    }
    mgb_assert(!sm_cpu_sync_level, "invalid cpu sync level: %d",
@@ -57,9 +65,17 @@ void CompNodeImplHelper::EventImplHelper::host_wait() {
 }
 //! Host-side wait for this event.
 //! - regular builds: spin, yielding the calling thread, until finished()
 //!   reports true
 //! - __DEPLOY_ON_XP_SP2__ with MGB_HAVE_THREAD: trap
 //! - __DEPLOY_ON_XP_SP2__ without MGB_HAVE_THREAD: return immediately without
 //!   querying finished()
 void CompNodeImplHelper::EventImplHelper::host_wait_cv() {
 #if !__DEPLOY_ON_XP_SP2__
    for (;;) {
        if (finished()) {
            break;
        }
        std::this_thread::yield();
    }
 #elif MGB_HAVE_THREAD
    __builtin_trap();
 #else
    return;
 #endif
 }
 double CompNodeImplHelper::EventImplHelper::elapsed_time_until(Event& end_) {
--- a/src/core/impl/comp_node/impl_helper.h
+++ b/src/core/impl/comp_node/impl_helper.h
@@ -49,7 +49,7 @@ namespace mgb {
     * been performed.
     */
    class CompNodeImplHelper::EventImplHelper: public Event {
        std::mutex m_mtx;
        MGB_MUTEX m_mtx;
        bool m_recorded = false, m_finished = false;
--- a/src/core/impl/comp_node/mem_alloc/impl.cpp
+++ b/src/core/impl/comp_node/mem_alloc/impl.cpp
@@ -59,11 +59,15 @@ MemAllocImplHelper::MemAddr MemAllocImplHelper::do_alloc(
        size_t size, bool allow_from_parent, bool log_stat_on_error) {
    mgb_assert(size);
 #if !__DEPLOY_ON_XP_SP2__
    m_mutex.lock();
 #endif
    auto iter = m_free_blk_size.lower_bound(FreeBlock{MemAddr{0, 0}, size});
    if (iter == m_free_blk_size.end()) {
 #if !__DEPLOY_ON_XP_SP2__
        m_mutex.unlock();
 #endif
        if (!allow_from_parent) {
            if (log_stat_on_error) {
                print_memory_state();
@@ -87,7 +91,9 @@ MemAllocImplHelper::MemAddr MemAllocImplHelper::do_alloc(
    if (remain)
        insert_free_unsafe({alloc_addr + size, remain});
 #if !__DEPLOY_ON_XP_SP2__
    m_mutex.unlock();
 #endif
    return alloc_addr;
 }
@@ -267,7 +273,9 @@ MemAllocImplHelper::MemAddr DevMemAllocImpl::alloc_from_parent(size_t size) {
            {
                // sleep to wait for async dealloc
                using namespace std::literals;
 #if !__DEPLOY_ON_XP_SP2__
                std::this_thread::sleep_for(0.2s);
 #endif
            }
            get = gather_stream_free_blk_and_release_full();
            mgb_log("device %d: sync all device and try to "
--- a/src/core/impl/comp_node/mem_alloc/impl.h
+++ b/src/core/impl/comp_node/mem_alloc/impl.h
@@ -73,7 +73,7 @@ class MemAllocImplHelper: virtual public MemAllocBase {
        //! map from address to size and size iter
        std::map<size_t, FreeBlockAddrInfo> m_free_blk_addr;
        std::mutex m_mutex;
        MGB_MUTEX m_mutex;
        struct BlkByAddrIter {
            decltype(m_free_blk_addr.begin()) aiter;
--- a/src/core/impl/graph/cg_impl_seq.cpp
+++ b/src/core/impl/graph/cg_impl_seq.cpp
@@ -48,7 +48,11 @@ class ComputingGraphImpl::ComputingSequence::ExecContext {
    std::unique_ptr<CompNodeSeqRecorder> m_recorder;
    //! whether the owning computing sequence currently holds a VarSanityCheck;
    //! always false when built with __DEPLOY_ON_XP_SP2__
    bool has_var_sanity_check() const {
 #if !__DEPLOY_ON_XP_SP2__
        const bool checker_present =
                static_cast<bool>(m_comp_seq->m_var_sanity_check);
        return checker_present;
 #else
        return false;
 #endif
    }
    void try_reset_recorder() {
@@ -305,10 +309,12 @@ void ComputingGraphImpl::ComputingSequence::preprocess(ExecContext* ctx) {
            m_owner_graph->var_node_mem_manager().alloc_var_node_mem_static();
    bool first_exec = m_first_exec;
 #if !__DEPLOY_ON_XP_SP2__
    if (!first_exec) {
        // var sanity check only for first run
        m_var_sanity_check.reset();
    }
 #endif
    m_owner_graph->event().signal_inplace<event::CompSeqExecBeforeStart>(
            m_owner_graph, this, &ctx->m_cleanup_callback, &m_used_comp_node,
@@ -342,9 +348,13 @@ void ComputingGraphImpl::ComputingSequence::attach_to_graph() {
                static_cast<ComputingSequence*>(gimpl->m_current_comp_seq);
        prev_seq->cleanup();
    }
 #if !__DEPLOY_ON_XP_SP2__
    //! disable VarSanityCheck when __DEPLOY_ON_XP_SP2__=1, because
    //! VarSanityCheck depends on std::thread
    if (gimpl->options().var_sanity_check_first_run) {
        m_var_sanity_check = std::make_unique<VarSanityCheck>(gimpl);
    }
 #endif
    gimpl->m_current_comp_seq = this;
 }
@@ -403,7 +413,9 @@ void ComputingGraphImpl::ComputingSequence::do_wait(bool explicit_user_wait) {
 }
 void ComputingGraphImpl::ComputingSequence::cleanup() {
 #if !__DEPLOY_ON_XP_SP2__
    m_var_sanity_check.reset();
 #endif
    if (has_uncaught_exception()) {
        mgb_log_warn(
                "fallback to simple graph waiting in dtor due to uncaught "
--- a/src/core/impl/graph/cg_impl_seq.h
+++ b/src/core/impl/graph/cg_impl_seq.h
@@ -30,7 +30,9 @@ class ComputingGraphImpl::ComputingSequence final : public AsyncExecutable {
    size_t m_run_id = 0;
    size_t m_cg_event_version = 0;
    mutable Maybe<double> m_prev_exec_time;
 #if !__DEPLOY_ON_XP_SP2__
    std::unique_ptr<VarSanityCheck> m_var_sanity_check;
 #endif
    std::unique_ptr<CompNodeSeqRecorder> m_comp_node_seq_recorder;
    NormalExecEnv m_exec_env;
@@ -46,7 +48,7 @@ class ComputingGraphImpl::ComputingSequence final : public AsyncExecutable {
    class ExecContext;
    std::unique_ptr<MegBrainError> m_async_exc;
    std::mutex m_async_exc_mutex;
    MGB_MUTEX m_async_exc_mutex;
    /*!
     * \brief check whether recording comp seq is enabled
--- a/src/core/impl/graph/operator_node.cpp
+++ b/src/core/impl/graph/operator_node.cpp
@@ -713,7 +713,9 @@ void PostExecActions::perform() {
    for (auto&& i : m_items) {
        if (enable) {
 #if !__DEPLOY_ON_XP_SP2__
            VarSanityCheck::check_var_after_exec(i.var, *i.recv_info);
 #endif
            if (i.shape_sync_hdl)
                i.shape_sync_hdl->sync_from_var();
--- a/src/core/impl/graph/static_infer_impl.cpp
+++ b/src/core/impl/graph/static_infer_impl.cpp
@@ -141,7 +141,11 @@ MGB_DEFINE_CLS_WITH_SUPER(StaticInferManagerImpl::TagConstShapeTrait final,
            TagTraitBase) //  {
    struct InferResultCache {
        Spinlock mtx;
 #if __DEPLOY_ON_XP_SP2__
        ThinHashMap<size_t, InpElement> storage;
 #else
        ThinHashMap<std::thread::id, InpElement> storage;
 #endif
    };
    static TagTraitArray sm_empty_deps;
    static InferResultCache sm_result_cache;
@@ -167,7 +171,11 @@ MGB_DEFINE_CLS_WITH_SUPER(StaticInferManagerImpl::TagConstShapeTrait final,
            {
                // thread_local not supported on ios; so we use a manual impl
                MGB_LOCK_GUARD(sm_result_cache.mtx);
 #if __DEPLOY_ON_XP_SP2__
                ret = &sm_result_cache.storage[0];
 #else
                ret = &sm_result_cache.storage[std::this_thread::get_id()];
 #endif
            }
            ret->m_shape = &tag()->shape();
            return ret;
--- a/src/core/impl/graph/static_infer_impl.h
+++ b/src/core/impl/graph/static_infer_impl.h
@@ -122,7 +122,7 @@ class StaticInferManagerImpl final: public StaticInferManager {
        struct TagTraitContainer;
        ComputingGraph * const m_owner_graph;
        std::recursive_mutex m_mtx;
        MGB_RECURSIVE_MUTEX m_mtx;
        //! callbacks to be invoked in destructor
        ThinHashMap<void*, thin_function<void()>> m_dtor_callbacks;
--- a/src/core/impl/graph/var_node.cpp
+++ b/src/core/impl/graph/var_node.cpp
@@ -20,7 +20,7 @@ using namespace cg;
 /* ===================== MemAllocPlan =====================  */
 std::mutex MemAllocPlan::ReadonlyFwdList::list_mutex;
 MGB_MUTEX MemAllocPlan::ReadonlyFwdList::list_mutex;
 void MemAllocPlan::ReadonlyFwdList::reset() {
    MGB_LOCK_GUARD(list_mutex);
--- a/src/core/impl/graph/var_node_mem_mgr.h
+++ b/src/core/impl/graph/var_node_mem_mgr.h
@@ -440,7 +440,7 @@ class VarNodeMemManager {
        ImpureMemPlanManager m_impure_mem_plan_mgr;
        std::mutex m_dynamic_alloc_mtx;
        MGB_MUTEX m_dynamic_alloc_mtx;
        const size_t* m_run_id_ptr = nullptr;
        SyncableCounter m_cpu_async_release_barrier;
--- a/src/core/impl/system.cpp
+++ b/src/core/impl/system.cpp
@@ -19,7 +19,13 @@ using namespace mgb;
 using namespace sys;
 //! Number of CPUs to use: the value reported by
 //! std::thread::hardware_concurrency(), but never less than 1. Builds with
 //! __DEPLOY_ON_XP_SP2__ always report 1.
 int sys::get_cpu_count() {
 #if !__DEPLOY_ON_XP_SP2__
    const unsigned int detected = std::thread::hardware_concurrency();
    // hardware_concurrency() may report 0; fall back to a single CPU then
    return detected == 0u ? 1u : detected;
 #else
    //! only a single thread is supported when deployed on xp sp2, so report
    //! one cpu even if the machine has more
    return 1;
 #endif
 }
 #if defined(WIN32)
@@ -153,9 +159,11 @@ bool sys::stderr_ansi_color() {
 //! no-op variant of set_thread_name: the name argument is accepted and ignored
 void sys::set_thread_name(const std::string&) {}
 #if !__DEPLOY_ON_XP_SP2__
 //! stub variant of get_thread_name: the thread id argument is ignored and the
 //! fixed name "@" is returned; not compiled when __DEPLOY_ON_XP_SP2__ is set
 std::string sys::get_thread_name(Maybe<std::thread::id>) {
    return std::string("@");
 }
 #endif
 namespace {
    class FakeTimedFuncInvoker final: public TimedFuncInvoker {
@@ -254,6 +262,7 @@ void sys::set_thread_name(const std::string &name) {
 #endif
 }
 #if !__DEPLOY_ON_XP_SP2__
 std::string sys::get_thread_name(Maybe<std::thread::id> tid_) {
 #if MGB_ENABLE_DEBUG_UTIL
    MGB_LOCK_GUARD(thread_name_map_lock);
@@ -269,10 +278,11 @@ std::string sys::get_thread_name(Maybe<std::thread::id> tid_) {
    return "";
 #endif
 }
 #endif
 namespace {
 class TimedFuncInvokerImpl final: public TimedFuncInvoker {
 class TimedFuncInvokerImpl final : public TimedFuncInvoker {
    /*
     * server-client protocol:
     *
@@ -308,7 +318,7 @@ class TimedFuncInvokerImpl final: public TimedFuncInvoker {
    bool m_watcher_should_stop = false;
    std::condition_variable m_watcher_stop_cv;
    std::mutex m_watcher_stop_mtx, m_global_mtx;
    MGB_MUTEX m_watcher_stop_mtx, m_global_mtx;
    void clear_sock_fd() {
        if (m_peer_fd)
@@ -567,8 +577,10 @@ class TimedFuncInvokerImpl final: public TimedFuncInvoker {
        auto start = high_resolution_clock::now(),
             end = start + timeout_due;
        for (; ; ) {
 #if !__DEPLOY_ON_XP_SP2__
            std::unique_lock<std::mutex> lk(m_watcher_stop_mtx);
            m_watcher_stop_cv.wait_until(lk, end);
 #endif
            if (m_watcher_should_stop)
                return false;
@@ -603,10 +615,9 @@ class TimedFuncInvokerImpl final: public TimedFuncInvoker {
            } MGB_CATCH(..., {});
            clear_sock_fd();
        }
 };
 } // anonymous namespace
 }  // anonymous namespace
 TimedFuncInvoker& TimedFuncInvoker::ins() {
    static TimedFuncInvokerImpl impl;
--- a/src/core/include/megbrain/common.h
+++ b/src/core/include/megbrain/common.h
@@ -205,6 +205,21 @@ void __log__(LogLevel level, const char *file, const char *func, int line,
 #define MGB_TOKENPASTE2(x, y) MGB_TOKENPASTE(x, y)
 #define MGB_LOCK_GUARD_CTOR(mtx) MGB_TOKENPASTE2(__lock_guard_, __LINE__)(mtx)
 #if __DEPLOY_ON_XP_SP2__
 //! refer to
 //! https://docs.microsoft.com/en-us/cpp/build/configuring-programs-for-windows-xp?view=msvc-160
 //! xp sp2 does not fully support the vc runtime, because KERNEL32.dll does
 //! not implement some base apis needed by c++ std features, for example,
 //! std::mutex/std::thread/std::condition_variable. As a workaround, we
 //! disable some MegEngine features on the xp sp2 env, for example, multi-thread etc!
 //! placeholder mutex types for this configuration: plain size_t objects
 //! instead of std::mutex / std::recursive_mutex
 #define MGB_MUTEX size_t
 #define MGB_RECURSIVE_MUTEX size_t
 //! lock-guard macros for this configuration: each one only expands to
 //! MGB_MARK_USED_VAR(mtx), so no lock object is created
 #define MGB_LOCK_GUARD(mtx) MGB_MARK_USED_VAR(mtx)
 #define MGB_LOCK_GUARD_UNIQUE(mtx) MGB_MARK_USED_VAR(mtx)
 //! the shared guard must forward \p mtx as well; passing the macro name
 //! itself would reference an undeclared identifier at every use site
 #define MGB_LOCK_GUARD_SHARED(mtx) MGB_MARK_USED_VAR(mtx)
 #else
 #define MGB_MUTEX std::mutex
 #define MGB_RECURSIVE_MUTEX std::recursive_mutex
 #define MGB_LOCK_GUARD(mtx) \
    std::lock_guard<decltype(mtx)> MGB_LOCK_GUARD_CTOR(mtx)
@@ -212,7 +227,8 @@ void __log__(LogLevel level, const char *file, const char *func, int line,
    std::unique_lock<decltype(mtx)> MGB_LOCK_GUARD_CTOR(mtx)
 #define MGB_LOCK_GUARD_SHARED(mtx) \
 	std::shared_lock<decltype(mtx)> MGB_LOCK_GUARD_CTOR(mtx)
    std::shared_lock<decltype(mtx)> MGB_LOCK_GUARD_CTOR(mtx)
 #endif
 /*!
 * \brief printf-like std::string constructor
--- a/src/core/include/megbrain/graph/var_node.h
+++ b/src/core/include/megbrain/graph/var_node.h
@@ -222,7 +222,7 @@ class MemAllocPlan final: public json::Serializable, public NonCopyableObj {
    private:
        class ReadonlyFwdList {
            MemAllocPlan *m_prev = nullptr, *m_next = nullptr;
            static std::mutex list_mutex;
            static MGB_MUTEX list_mutex;
        public:
            MemAllocPlan* next() const { return m_next; }
            void reset();
--- a/src/core/include/megbrain/system.h
+++ b/src/core/include/megbrain/system.h
@@ -27,11 +27,13 @@ namespace sys {
    //! set name of caller thread
    void set_thread_name(const std::string &name);
 #if !__DEPLOY_ON_XP_SP2__
    /*!
     * \brief get name of given thread
     * \param tid thread id, or None for the caller thread
     */
    std::string get_thread_name(Maybe<std::thread::id> tid = None);
 #endif
    //! get number of CPU cores on this system
    int get_cpu_count();
--- a/src/core/include/megbrain/utils/async_worker_impl_0.h
+++ b/src/core/include/megbrain/utils/async_worker_impl_0.h
@@ -35,14 +35,20 @@ class AsyncWorkerSet final: public NonCopyableObj {
 };
 class FutureThreadPoolBase : public NonCopyableObj {
 #if !__DEPLOY_ON_XP_SP2__
    std::vector<std::thread::id> m_ids;
 #endif
    public:
        FutureThreadPoolBase(const Maybe<std::string>& = None) {}
 #if __DEPLOY_ON_XP_SP2__
        size_t start(size_t concurrency) { return concurrency; }
 #else
        const std::vector<std::thread::id>& start(size_t concurrency) {
            m_ids.resize(concurrency, std::this_thread::get_id());
            return m_ids;
        }
 #endif
        void stop() {
        }
--- a/src/core/include/megbrain/utils/event.h
+++ b/src/core/include/megbrain/utils/event.h
@@ -53,7 +53,7 @@ class SyncEventConnecter: public NonCopyableObj {
    using ReceiverMap = ThinHashMap<Typeinfo*, ReceiverList>;
    bool m_is_empty = true;
    std::mutex m_mtx;
    MGB_MUTEX m_mtx;
    //! map from type to receiver; use shared_ptr because it would be kept by
    //! handlers
    std::shared_ptr<ReceiverMap> m_receiver_map =
--- a/src/core/include/megbrain/utils/persistent_cache.h
+++ b/src/core/include/megbrain/utils/persistent_cache.h
@@ -83,7 +83,7 @@ namespace mgb {
                std::string,
                std::unordered_map<BlobStorage, BlobStorage, BlobStorage::Hash>>
                m_cache;
        std::mutex m_mtx;
        MGB_MUTEX m_mtx;
    };
    /*!
--- a/src/opr/impl/basic_arith.cpp
+++ b/src/opr/impl/basic_arith.cpp
@@ -33,7 +33,7 @@ namespace {
    template<class Opr>
    class StaticInferOpr {
        intl::UniqPtrWithCN<Opr> m_opr;
        std::mutex m_mtx;
        MGB_MUTEX m_mtx;
        public:
            class Lock {
@@ -43,7 +43,9 @@ namespace {
                explicit Lock(StaticInferOpr *owner):
                    m_owner{owner}
                {
 #if !__DEPLOY_ON_XP_SP2__
                    m_owner->m_mtx.lock();
 #endif
                }
                public:
@@ -54,8 +56,10 @@ namespace {
                    }
                    ~Lock() {
 #if !__DEPLOY_ON_XP_SP2__
                        if (m_owner)
                            m_owner->m_mtx.unlock();
 #endif
                    }
                    Lock& operator = (const Lock &) = delete;
--- a/src/opr/impl/internal/indexing_helper.cpp
+++ b/src/opr/impl/internal/indexing_helper.cpp
@@ -277,7 +277,7 @@ SubTensorSpec FancyIndexingHelper::fancy_indexing_make_sub_spec(
    mgb_assert(m_require_scalar_index || !fake_single_idx);
    static DeviceTensorND fake_val;
    static std::mutex fake_val_mtx;
    static MGB_MUTEX fake_val_mtx;
    if (mgb_unlikely(fake_val.empty())) {
        MGB_LOCK_GUARD(fake_val_mtx);
--- a/src/opr/impl/internal/megdnn_opr_wrapper.cpp
+++ b/src/opr/impl/internal/megdnn_opr_wrapper.cpp
@@ -53,7 +53,7 @@ namespace {
        MGB_TYPEINFO_OBJ_DECL;
        public:
            std::mutex mtx;
            MGB_MUTEX mtx;
            CompNode::UnorderedMap<DeviceTensorStorage> cn2storage;
    };
    MGB_TYPEINFO_OBJ_IMPL(TempStorageContainer);
--- a/src/opr/impl/io.cpp
+++ b/src/opr/impl/io.cpp
@@ -377,7 +377,7 @@ MGB_DYN_TYPE_OBJ_FINAL_IMPL(SharedDeviceTensorWithFormat);
 MGB_DYN_TYPE_OBJ_FINAL_IMPL(ImmutableTensor);
 class ImmutableTensor::Value {
    std::mutex m_mtx;
    MGB_MUTEX m_mtx;
    DeviceTensorND m_dev, m_static_infer;
    std::string m_summary;
@@ -527,7 +527,7 @@ class ImmutableTensor::DevValueCache final: public UserDataContainer::UserData {
    std::unordered_map<TensorKey, Value, Hash> m_tensor2val;
    std::unordered_map<ScalarKey, Value, Hash> m_scalar2val;
    std::mutex m_mtx;
    MGB_MUTEX m_mtx;
    void setup_value(Value &dest, const HostTensorND &val) {
        dest.setup(m_comp_node, val);
--- a/src/opr/impl/loop/impl.cpp
+++ b/src/opr/impl/loop/impl.cpp
@@ -888,7 +888,7 @@ class LoopImpl::MutableStateSaver::Recorder final: public NonCopyableObj {
    //! mutex for m_saved_buckets, used between copy_bucket_to_host() and the
    //! async copy task in m_copy_threadpool
    std::mutex m_saved_buckets_mtx;
    MGB_MUTEX m_saved_buckets_mtx;
    //! see on_fwd_finish()
    TensorShape m_var_shape;
    bool m_enabled = false;
--- a/src/opr/impl/search_policy/profiler.cpp
+++ b/src/opr/impl/search_policy/profiler.cpp
@@ -356,7 +356,9 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
            next_report_time = timer.get_secs() + 1;
        }
        using namespace std::literals;
 #if !__DEPLOY_ON_XP_SP2__
        std::this_thread::sleep_for(1000us);
 #endif
    }
    // release all free blocks owned by child process,
    // in order to avoid main process running out of memory
--- a/src/opr/impl/utility.cpp
+++ b/src/opr/impl/utility.cpp
@@ -731,7 +731,7 @@ class PersistentOutputStorage::StorageHolder final
                                     key.second);
        }
    };
    std::mutex m_mtx;
    MGB_MUTEX m_mtx;
    std::unordered_map<Key, DeviceTensorStorage, KeyHash> m_storage;
 public:
--- a/src/plugin/impl/var_value_checker.cpp
+++ b/src/plugin/impl/var_value_checker.cpp
@@ -125,9 +125,13 @@ void VarValueChecker::on_var_computed(VarNode *var) {
    }
    if (!m_init_val_dumped) {
 #if !__DEPLOY_ON_XP_SP2__
        m_var2val_mtx.lock();
        auto &&val = m_var2val[var];
 #endif
        auto&& val = m_var2val[var];
 #if !__DEPLOY_ON_XP_SP2__
        m_var2val_mtx.unlock();
 #endif
        mgb_assert(!val);
        val = std::make_shared<DeviceTensorND>();
--- a/src/plugin/include/megbrain/plugin/cpu_dispatch_checker.h
+++ b/src/plugin/include/megbrain/plugin/cpu_dispatch_checker.h
@@ -22,7 +22,7 @@ namespace mgb {
     * This is intended to find potential bugs in megdnn.
     */
    class CPUDispatchChecker final: public PluginBase {
        std::mutex m_cn2nr_task_mtx,
        MGB_MUTEX m_cn2nr_task_mtx,
            m_failed_oprs_mtx_storage,
            *m_failed_oprs_mtx = &m_failed_oprs_mtx_storage;
        CompNode::UnorderedMap<size_t> m_cn2nr_task;
--- a/src/plugin/include/megbrain/plugin/opr_io_dump.h
+++ b/src/plugin/include/megbrain/plugin/opr_io_dump.h
@@ -60,7 +60,7 @@ class TextOprIODump final : public OprIODumpBase {
    bool m_print_addr = true;
    std::shared_ptr<FILE> m_fout;
    size_t m_max_size = 5;
    std::mutex m_mtx;
    MGB_MUTEX m_mtx;
    std::unique_ptr<LazyValueRecorder> m_lazy_value;
    void dump_var(VarNode* var, bool lazy_sync) override;
--- a/src/plugin/include/megbrain/plugin/var_sanity_check.h
+++ b/src/plugin/include/megbrain/plugin/var_sanity_check.h
@@ -64,7 +64,7 @@ class VarSanityCheck final : public PluginBase {
    //! map from caller thread to workspace map
    ThinHashMap<std::thread::id, WorkspaceCache> m_workspace;
    std::mutex m_workspace_mtx;
    MGB_MUTEX m_workspace_mtx;
    ThinHashMap<VarNode*, ChecksumResult> m_var2chksum;
    /*! the ids of varnodes that have been modified by recv_opr
@@ -72,7 +72,7 @@ class VarSanityCheck final : public PluginBase {
     * cg::OperatorNodeBase::NodeProp::Flag:: FORCE_UPDATE_INPUT_VAR.
     */
    ThinHashSet<VarNode*> m_modified_vars;
    std::mutex m_id2chksum_mtx;
    MGB_MUTEX m_id2chksum_mtx;
    typedef void (VarSanityCheck::*input_checker_fn)(cg::OperatorNodeBase*,
                                                     VarNode*);
--- a/src/plugin/include/megbrain/plugin/var_value_checker.h
+++ b/src/plugin/include/megbrain/plugin/var_value_checker.h
@@ -50,7 +50,7 @@ namespace mgb {
        size_t m_cur_var_idx, m_nr_exec;
        VarNodeArray m_vars;
        std::mutex m_var2val_mtx;
        MGB_MUTEX m_var2val_mtx;
        ThinHashMap<VarNode*, std::shared_ptr<DeviceTensorND>> m_var2val;
        Checker m_checker;