diff --git a/dnn/src/common/basic_types.cpp b/dnn/src/common/basic_types.cpp
index 96de41d3..eeee2479 100644
--- a/dnn/src/common/basic_types.cpp
+++ b/dnn/src/common/basic_types.cpp
@@ -60,10 +60,10 @@ T deserialize_pod(const std::string& data, size_t& offset) {
 ErrorHandler* ErrorHandler::sm_inst;
 
 ErrorHandler* ErrorHandler::inst() {
-    static std::mutex mtx;
+    static DNN_MUTEX mtx;
     static DefaultErrorHandler default_handler;
     if (megdnn_unlikely(!sm_inst)) {
-        std::lock_guard<std::mutex> lg{mtx};
+        MEGDNN_LOCK_GUARD(mtx);
         if (!sm_inst) {
             sm_inst = &default_handler;
         }
diff --git a/dnn/src/common/cv/interp_helper.cpp b/dnn/src/common/cv/interp_helper.cpp
index 9506a3f6..d2cc60cd 100644
--- a/dnn/src/common/cv/interp_helper.cpp
+++ b/dnn/src/common/cv/interp_helper.cpp
@@ -145,7 +145,7 @@ init_inter_tab_1d(InterpolationMode imode, float* tab, int tabsz) {
 #if MEGDNN_X86
 DEF_FUN(const int16_t*) get_linear_ic4_table() {
     auto table_holder = &sm_tab_linear;
-    std::lock_guard<std::mutex> lg{table_holder->mtx};
+    MEGDNN_LOCK_GUARD(table_holder->mtx);
     float* tab = nullptr;
     short* itab = nullptr;
     MEGDNN_MARK_USED_VAR(tab);
@@ -175,7 +175,7 @@ DEF_FUN(const void*) get_table(InterpolationMode imode, bool fixpt) {
         default:
             megdnn_throw(("unsupported interpolation mode"));
     }
-    std::lock_guard<std::mutex> lg{table_holder->mtx};
+    MEGDNN_LOCK_GUARD(table_holder->mtx);
 
     float* tab = nullptr;
     short* itab = nullptr;
diff --git a/dnn/src/common/cv/interp_helper.h b/dnn/src/common/cv/interp_helper.h
index c1cf68f0..922a14a4 100644
--- a/dnn/src/common/cv/interp_helper.h
+++ b/dnn/src/common/cv/interp_helper.h
@@ -134,7 +134,7 @@ private:
     };
 
     struct TableHolderBase {
-        std::mutex mtx;
+        DNN_MUTEX mtx;
 
         //! get table pointer; return whether already init
         virtual bool get(float**, int16_t**) = 0;
diff --git a/dnn/src/common/elemwise/opr_impl.cpp b/dnn/src/common/elemwise/opr_impl.cpp
index 52c01490..96eb820d 100644
--- a/dnn/src/common/elemwise/opr_impl.cpp
+++ b/dnn/src/common/elemwise/opr_impl.cpp
@@ -39,10 +39,10 @@ using Mode = param::Elemwise::Mode;
 using ModeTrait = ElemwiseForward::ModeTrait;
 
 const ModeTrait& ModeTrait::from_mode(Mode mode) {
-    static std::mutex mtx;
+    static DNN_MUTEX mtx;
     static std::vector<ModeTrait> traits;
 
-    std::lock_guard<std::mutex> _lock(mtx);
+    MEGDNN_LOCK_GUARD(mtx);
 
     if (traits.empty()) {
         auto get = [&](Mode m) -> ModeTrait& {
diff --git a/dnn/src/common/elemwise_multi_type/opr_impl.cpp b/dnn/src/common/elemwise_multi_type/opr_impl.cpp
index ef1ec392..dd2046a1 100644
--- a/dnn/src/common/elemwise_multi_type/opr_impl.cpp
+++ b/dnn/src/common/elemwise_multi_type/opr_impl.cpp
@@ -28,10 +28,10 @@ void check_dtype(const ModeTrait& trait, size_t i, const TensorLayout& src) {
 }  // anonymous namespace
 
 const ModeTrait& ModeTrait::from_mode(Mode mode) {
-    static std::mutex mtx;
+    static DNN_MUTEX mtx;
     static std::vector<ModeTrait> traits;
 
-    std::lock_guard<std::mutex> _lock(mtx);
+    MEGDNN_LOCK_GUARD(mtx);
 
     auto make_check_dtype_func = [](DType expected) {
         auto func = [expected](DType dtype) {
diff --git a/dnn/src/common/handle_impl.h b/dnn/src/common/handle_impl.h
index 34cf79c0..1f6431f2 100644
--- a/dnn/src/common/handle_impl.h
+++ b/dnn/src/common/handle_impl.h
@@ -70,7 +70,7 @@ protected:
         MIDOUT_BEGIN(dnn_src_common_handle_impl, Opr, idx) {
             static_assert(idx < NR_HELPER_OPRS, "invalid idx");
             if (!self->m_helper_oprs[idx]) {
-                std::lock_guard<std::mutex> lg{self->m_helper_oprs_mtx};
+                MEGDNN_LOCK_GUARD(self->m_helper_oprs_mtx);
                 if (!self->m_helper_oprs[idx]) {
                     self->m_helper_oprs[idx] =
                             self->template create_operator<Opr>();
@@ -88,7 +88,7 @@ protected:
 
 private:
     std::array<std::unique_ptr<OperatorBase>, NR_HELPER_OPRS> m_helper_oprs;
-    std::mutex m_helper_oprs_mtx;
+    DNN_MUTEX m_helper_oprs_mtx;
 };
 
 }  // namespace megdnn
diff --git a/dnn/src/common/opr_delegate.h b/dnn/src/common/opr_delegate.h
index 0be9f4d4..d5e96b4b 100644
--- a/dnn/src/common/opr_delegate.h
+++ b/dnn/src/common/opr_delegate.h
@@ -38,7 +38,7 @@ const std::shared_ptr<Handle>& inplace_cpu_handle(int debug_level = 0);
  */
 template <int nr_opr = 1>
 class CpuOprDelegationStorage {
-    std::mutex m_mtx;
+    DNN_MUTEX m_mtx;
     std::shared_ptr<Handle> m_handle;
     std::unique_ptr<OperatorBase> m_oprs[nr_opr];
 
diff --git a/dnn/src/common/tensor_format.cpp b/dnn/src/common/tensor_format.cpp
index ac4736ad..1b700e94 100644
--- a/dnn/src/common/tensor_format.cpp
+++ b/dnn/src/common/tensor_format.cpp
@@ -604,7 +604,7 @@ TensorLayout LowbitsAlignedTensorFormatBase::collapse_contiguous_spec(
 TensorFormat Image2DPack4TensorFormat::make_raw(
         size_t align_axis, size_t align_size_in_elements,
         Handle::HandleVendorType vendor_type) {
-    static std::mutex mtx;
+    static DNN_MUTEX mtx;
     static std::unordered_map<uint64_t,
                               std::unique_ptr<Image2DPack4TensorFormat>>
             cache;
@@ -641,7 +641,7 @@ TensorFormat Image2DPack4TensorFormat::change_axis(size_t axis) const {
 /* ===================== LowbitsitsAlignedToBytesTensorFormat
  * ===================== */
 TensorFormat LowbitsAlignedToBytesTensorFormat::make(size_t size_nbits) {
-    static std::mutex mtx;
+    static DNN_MUTEX mtx;
     static std::unordered_map<
             uint64_t, std::unique_ptr<LowbitsAlignedToBytesTensorFormat>>
             cache;
diff --git a/dnn/src/common/utils.h b/dnn/src/common/utils.h
index 11077216..452477d9 100644
--- a/dnn/src/common/utils.h
+++ b/dnn/src/common/utils.h
@@ -118,8 +118,17 @@
 #define megdnn_layout_msg(layout) \
     std::string(#layout "=" + (layout).to_string())
 
-#define MEGDNN_LOCK_GUARD(var) \
-    std::lock_guard<std::remove_cv_t<decltype(var)>> _lock_guard_##var { var }
+#if __DEPLOY_ON_XP_SP2__  // XP SP2 build: std::mutex is not used
+#define DNN_MUTEX size_t  // placeholder so mutex declarations still compile
+#define MEGDNN_LOCK_GUARD(var) MEGDNN_MARK_USED_VAR(var)  // no guard object
+#else
+#define DNN_MUTEX std::mutex
+#define DNN_TOKENPASTE(x, y) x##y
+#define DNN_TOKENPASTE2(x, y) DNN_TOKENPASTE(x, y)  // expands __LINE__ first
+#define DNN_LOCK_GUARD_CTOR(mtx) DNN_TOKENPASTE2(__lock_guard_, __LINE__)(mtx)
+#define MEGDNN_LOCK_GUARD(mtx) \
+    std::lock_guard<decltype(mtx)> DNN_LOCK_GUARD_CTOR(mtx)
+#endif  // __DEPLOY_ON_XP_SP2__
 
 namespace megdnn {
 
@@ -487,7 +496,7 @@ struct _SafeMultipliesImplUnsigned : public std::binary_function<T, T, T> {
                 "implicit conversion disallowed in SafeMultiplies");
         megdnn_trap();
     }
-};
+};
 
 template <>
 struct SafeMultiplies<size_t> : public _SafeMultipliesImplUnsigned<size_t> {};
diff --git a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.h b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.h
index 11ab21aa..8e52a7ec 100644
--- a/dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.h
+++ b/dnn/src/fallback/conv_bias/conv1x1/conv1x1_utils.h
@@ -81,7 +81,7 @@ public:
     }
 
 private:
-    std::mutex m_mtx;
+    DNN_MUTEX m_mtx;
     std::unordered_map<StrategyHashKey, std::unique_ptr<T>, StrategyHasher,
                        StrategyHashKeyEqual>
             m_map_strategies;
@@ -99,4 +99,4 @@ MatrixMulImpl::KernSizeParam get_matmul_kern_param(
 }  // namespace fallback
 }  // namespace megdnn
 
-// vim: syntax=cpp.doxygen
\ No newline at end of file
+// vim: syntax=cpp.doxygen
diff --git a/dnn/src/fallback/conv_bias/im2col/factory.h b/dnn/src/fallback/conv_bias/im2col/factory.h
index f7daef77..bbaa0de2 100644
--- a/dnn/src/fallback/conv_bias/im2col/factory.h
+++ b/dnn/src/fallback/conv_bias/im2col/factory.h
@@ -110,7 +110,7 @@ struct StrategyHashParamEqual {
 };
 
 class StrategyDelegationStorage {
-    std::mutex m_mtx;
+    DNN_MUTEX m_mtx;
     std::unordered_map<StrategyHashParam, std::unique_ptr<StrategyBase>,
                        StrategyHashParamHash, StrategyHashParamEqual>
             map_strategys;
diff --git a/dnn/src/naive/sleep/opr_impl.cpp b/dnn/src/naive/sleep/opr_impl.cpp
index db1c167d..ca1ebb60 100644
--- a/dnn/src/naive/sleep/opr_impl.cpp
+++ b/dnn/src/naive/sleep/opr_impl.cpp
@@ -11,6 +11,10 @@
 
 #include "./opr_impl.h"
 
+#if __DEPLOY_ON_XP_SP2__
+#define MEGDNN_NO_THREAD 1
+#endif
+
 #include "src/naive/handle.h"
 #if !MEGDNN_NO_THREAD
 #include <thread>
@@ -20,10 +24,10 @@ namespace megdnn {
 namespace naive {
 
 void SleepForwardImpl::exec() {
-    double seconds = m_param.time;
 #if MEGDNN_NO_THREAD
     megdnn_trap();
 #else
+    double seconds = m_param.time;
     MEGDNN_DISPATCH_CPU_KERN_OPR(
             std::this_thread::sleep_for(std::chrono::microseconds(
                     static_cast<uint64_t>(seconds * 1e6))););
diff --git a/sdk/load-and-run/src/infile_persistent_cache.h b/sdk/load-and-run/src/infile_persistent_cache.h
index 33fcc2c2..d9dc5bf0 100644
--- a/sdk/load-and-run/src/infile_persistent_cache.h
+++ b/sdk/load-and-run/src/infile_persistent_cache.h
@@ -52,7 +52,7 @@ class InFilePersistentCache final : public PersistentCache {
     std::unordered_map<std::string, std::unordered_map<BlobStorage, BlobStorage,
                                                        BlobStorage::Hash>>
             m_cache;
-    std::mutex m_mtx;
+    MGB_MUTEX m_mtx;
 
     template <typename Input>
     void read_cache(Input& inp);
diff --git a/src/core/impl/comp_node/comp_node.cpp b/src/core/impl/comp_node/comp_node.cpp
index d4bad933..cca03b65 100644
--- a/src/core/impl/comp_node/comp_node.cpp
+++ b/src/core/impl/comp_node/comp_node.cpp
@@ -32,7 +32,7 @@ namespace {
     std::atomic_flag
         g_default_cpu_initialized,
         g_exit_handler_registered[CompNode::NR_DEVICE_TYPE];
-    std::mutex g_device_map_mtx;
+    MGB_MUTEX g_device_map_mtx;
     ThinHashMap<CompNode::DeviceType, ThinHashMap<int, int>> g_device_map;
     CompNode::DeviceType g_unspec_locator_type;
 
diff --git a/src/core/impl/comp_node/cpu/comp_node.cpp b/src/core/impl/comp_node/cpu/comp_node.cpp
index 92633e15..0ac86e7d 100644
--- a/src/core/impl/comp_node/cpu/comp_node.cpp
+++ b/src/core/impl/comp_node/cpu/comp_node.cpp
@@ -60,7 +60,11 @@ class CpuCompNode::WorkerQueue final
             sys::set_cpu_affinity({m_locator.device});
 #endif
         }
+#if __DEPLOY_ON_XP_SP2__
+        __builtin_trap();
+#else
         sys::set_thread_name(m_locator.to_string());
+#endif
     }
 
     void on_sync_all_task_finish() override {
@@ -830,7 +834,9 @@ struct CpuCompNode::Pool {
         void operator()(CompNodeRecorderImpl* p) { p->~CompNodeRecorderImpl(); }
     };
 
+#if !__DEPLOY_ON_XP_SP2__
     std::recursive_mutex mtx;
+#endif
     // use global memory pool to ensuare object memory accessible even after
     // global finalize
     std::aligned_storage_t<sizeof(CompNodeRecorderImpl),
@@ -862,7 +868,9 @@ void CpuCompNode::foreach (thin_function<void(CompNode)> callback) {
     for (size_t i = 0;; ++i) {
         CompNode cur;
         {
+#if !__DEPLOY_ON_XP_SP2__
             MGB_LOCK_GUARD(sm_pool->mtx);
+#endif
             if (i >= sm_pool->nr_used_impl_storage)
                 return;
             cur = make_comp_node_from_impl(
@@ -909,7 +917,9 @@ CpuCompNode::Impl* CpuCompNode::load_cpu(Locator locator,
                        locator.device == Locator::DEVICE_MULTITHREAD_DEFAULT,
                "failed to load cpu for device:%d stream:%d", locator.device,
                locator.stream);
+#if !__DEPLOY_ON_XP_SP2__
     MGB_LOCK_GUARD(sm_pool->mtx);
+#endif
 
     // encode both device ID and type into a int
     mgb_assert(locator_logical.device >= -1 ||
@@ -967,7 +977,9 @@ void CpuCompNode::sync_all() {
     if (!sm_pool)
         return;
 
+#if !__DEPLOY_ON_XP_SP2__
     MGB_LOCK_GUARD(sm_pool->mtx);
+#endif
     for (auto&& i : sm_pool->locator2impl)
         i.second->sync();
     for (auto&& i : sm_pool->locator2impl_multi_thread)
@@ -1049,7 +1061,9 @@ void CpuCompNode::CpuDispatchableBase::EventImpl::do_device_wait_by(
 
     auto waiter = [this, version]() {
         while (m_record_nr_finish.load(std::memory_order_acquire) < version) {
+#if !__DEPLOY_ON_XP_SP2__
             std::unique_lock<std::mutex> lk{m_dev_wait_mtx};
+#endif
             if (m_record_nr_finish.load(std::memory_order_acquire) >= version) {
                 break;
             }
@@ -1078,10 +1092,12 @@ void CpuCompNode::CpuDispatchableBase::EventImpl::on_finish() {
     }
 
     m_record_nr_finish.fetch_add(1, std::memory_order_release);
+#if !__DEPLOY_ON_XP_SP2__
     if (m_dev_wait_nr_waiter.load(std::memory_order_acquire)) {
         MGB_LOCK_GUARD(m_dev_wait_mtx);
         m_dev_wait_cv.notify_all();
     }
+#endif
 }
 
 bool CpuCompNode::CpuDispatchableBase::EventImpl::do_finished() {
@@ -1100,11 +1116,15 @@ void CpuCompNode::CpuDispatchableBase::EventImpl::host_wait_cv() {
 
     m_dev_wait_nr_waiter.fetch_add(1, std::memory_order_release);
     for (;;) {
+#if !__DEPLOY_ON_XP_SP2__
         std::unique_lock<std::mutex> lock{m_dev_wait_mtx};
+#endif
         if (finished()) {
             break;
         }
+#if !__DEPLOY_ON_XP_SP2__
         m_dev_wait_cv.wait(lock);
+#endif
     }
     m_dev_wait_nr_waiter.fetch_sub(1, std::memory_order_release);
 }
diff --git a/src/core/impl/comp_node/impl_helper.cpp b/src/core/impl/comp_node/impl_helper.cpp
index 8101c97b..585a83e1 100644
--- a/src/core/impl/comp_node/impl_helper.cpp
+++ b/src/core/impl/comp_node/impl_helper.cpp
@@ -45,9 +45,17 @@ void CompNodeImplHelper::EventImplHelper::host_wait() {
         return;
     }
     if (sm_cpu_sync_level >= 1) {
+#if __DEPLOY_ON_XP_SP2__
+#if MGB_HAVE_THREAD
+        __builtin_trap();
+#else
+        return;
+#endif
+#else
         while (!finished()) {
             std::this_thread::yield();
         }
+#endif
         return;
     }
     mgb_assert(!sm_cpu_sync_level, "invalid cpu sync level: %d",
@@ -57,9 +65,17 @@ void CompNodeImplHelper::EventImplHelper::host_wait() {
 }
 
 void CompNodeImplHelper::EventImplHelper::host_wait_cv() {
+#if __DEPLOY_ON_XP_SP2__
+#if MGB_HAVE_THREAD
+    __builtin_trap();
+#else
+    return;
+#endif
+#else
     while (!finished()) {
         std::this_thread::yield();
     }
+#endif
 }
 
 double CompNodeImplHelper::EventImplHelper::elapsed_time_until(Event& end_) {
diff --git a/src/core/impl/comp_node/impl_helper.h b/src/core/impl/comp_node/impl_helper.h
index a1d4f1e6..62df1da9 100644
--- a/src/core/impl/comp_node/impl_helper.h
+++ b/src/core/impl/comp_node/impl_helper.h
@@ -49,7 +49,7 @@ namespace mgb {
      * been performed.
      */
     class CompNodeImplHelper::EventImplHelper: public Event {
-        std::mutex m_mtx;
+        MGB_MUTEX m_mtx;
 
         bool m_recorded = false, m_finished = false;
 
diff --git a/src/core/impl/comp_node/mem_alloc/impl.cpp b/src/core/impl/comp_node/mem_alloc/impl.cpp
index 88c21225..75a8cfad 100644
--- a/src/core/impl/comp_node/mem_alloc/impl.cpp
+++ b/src/core/impl/comp_node/mem_alloc/impl.cpp
@@ -59,11 +59,15 @@ MemAllocImplHelper::MemAddr MemAllocImplHelper::do_alloc(
         size_t size, bool allow_from_parent, bool log_stat_on_error) {
 
     mgb_assert(size);
+#if !__DEPLOY_ON_XP_SP2__
     m_mutex.lock();
+#endif
 
     auto iter = m_free_blk_size.lower_bound(FreeBlock{MemAddr{0, 0}, size});
     if (iter == m_free_blk_size.end()) {
+#if !__DEPLOY_ON_XP_SP2__
         m_mutex.unlock();
+#endif
         if (!allow_from_parent) {
             if (log_stat_on_error) {
                 print_memory_state();
@@ -87,7 +91,9 @@ MemAllocImplHelper::MemAddr MemAllocImplHelper::do_alloc(
     if (remain)
         insert_free_unsafe({alloc_addr + size, remain});
 
+#if !__DEPLOY_ON_XP_SP2__
     m_mutex.unlock();
+#endif
     return alloc_addr;
 }
 
@@ -267,7 +273,9 @@ MemAllocImplHelper::MemAddr DevMemAllocImpl::alloc_from_parent(size_t size) {
             {
                 // sleep to wait for async dealloc
                 using namespace std::literals;
+#if !__DEPLOY_ON_XP_SP2__
                 std::this_thread::sleep_for(0.2s);
+#endif
             }
             get = gather_stream_free_blk_and_release_full();
             mgb_log("device %d: sync all device and try to "
diff --git a/src/core/impl/comp_node/mem_alloc/impl.h b/src/core/impl/comp_node/mem_alloc/impl.h
index 11e1de00..e5c08abe 100644
--- a/src/core/impl/comp_node/mem_alloc/impl.h
+++ b/src/core/impl/comp_node/mem_alloc/impl.h
@@ -73,7 +73,7 @@ class MemAllocImplHelper: virtual public MemAllocBase {
         //! map from address to size and size iter
         std::map<size_t, FreeBlockAddrInfo> m_free_blk_addr;
 
-        std::mutex m_mutex;
+        MGB_MUTEX m_mutex;
 
         struct BlkByAddrIter {
             decltype(m_free_blk_addr.begin()) aiter;
diff --git a/src/core/impl/graph/cg_impl_seq.cpp b/src/core/impl/graph/cg_impl_seq.cpp
index c59604f4..1d5444d6 100644
--- a/src/core/impl/graph/cg_impl_seq.cpp
+++ b/src/core/impl/graph/cg_impl_seq.cpp
@@ -48,7 +48,11 @@ class ComputingGraphImpl::ComputingSequence::ExecContext {
     std::unique_ptr<CompNodeSeqRecorder> m_recorder;
 
     bool has_var_sanity_check() const {
+#if __DEPLOY_ON_XP_SP2__
+        return false;
+#else
         return static_cast<bool>(m_comp_seq->m_var_sanity_check);
+#endif
     }
 
     void try_reset_recorder() {
@@ -305,10 +309,12 @@ void ComputingGraphImpl::ComputingSequence::preprocess(ExecContext* ctx) {
             m_owner_graph->var_node_mem_manager().alloc_var_node_mem_static();
 
     bool first_exec = m_first_exec;
+#if !__DEPLOY_ON_XP_SP2__
     if (!first_exec) {
         // var sanity check only for first run
         m_var_sanity_check.reset();
     }
+#endif
 
     m_owner_graph->event().signal_inplace<event::CompSeqExecBeforeStart>(
             m_owner_graph, this, &ctx->m_cleanup_callback, &m_used_comp_node,
@@ -342,9 +348,13 @@ void ComputingGraphImpl::ComputingSequence::attach_to_graph() {
                 static_cast<ComputingSequence*>(gimpl->m_current_comp_seq);
         prev_seq->cleanup();
     }
+#if !__DEPLOY_ON_XP_SP2__
+    //! VarSanityCheck is disabled when __DEPLOY_ON_XP_SP2__=1 because it
+    //! depends on std::thread
     if (gimpl->options().var_sanity_check_first_run) {
         m_var_sanity_check = std::make_unique<VarSanityCheck>(gimpl);
     }
+#endif
     gimpl->m_current_comp_seq = this;
 }
 
@@ -403,7 +413,9 @@ void ComputingGraphImpl::ComputingSequence::do_wait(bool explicit_user_wait) {
 }
 
 void ComputingGraphImpl::ComputingSequence::cleanup() {
+#if !__DEPLOY_ON_XP_SP2__
     m_var_sanity_check.reset();
+#endif
     if (has_uncaught_exception()) {
         mgb_log_warn(
                 "fallback to simple graph waiting in dtor due to uncaught "
diff --git a/src/core/impl/graph/cg_impl_seq.h b/src/core/impl/graph/cg_impl_seq.h
index 47818a6f..f13e50e9 100644
--- a/src/core/impl/graph/cg_impl_seq.h
+++ b/src/core/impl/graph/cg_impl_seq.h
@@ -30,7 +30,9 @@ class ComputingGraphImpl::ComputingSequence final : public AsyncExecutable {
     size_t m_run_id = 0;
     size_t m_cg_event_version = 0;
     mutable Maybe<double> m_prev_exec_time;
+#if !__DEPLOY_ON_XP_SP2__
     std::unique_ptr<VarSanityCheck> m_var_sanity_check;
+#endif
     std::unique_ptr<CompNodeSeqRecorder> m_comp_node_seq_recorder;
 
     NormalExecEnv m_exec_env;
@@ -46,7 +48,7 @@ class ComputingGraphImpl::ComputingSequence final : public AsyncExecutable {
     class ExecContext;
 
     std::unique_ptr<MegBrainError> m_async_exc;
-    std::mutex m_async_exc_mutex;
+    MGB_MUTEX m_async_exc_mutex;
 
     /*!
      * \brief check whether recording comp seq is enabled
diff --git a/src/core/impl/graph/operator_node.cpp b/src/core/impl/graph/operator_node.cpp
index 202e5876..d5bc5502 100644
--- a/src/core/impl/graph/operator_node.cpp
+++ b/src/core/impl/graph/operator_node.cpp
@@ -713,7 +713,9 @@ void PostExecActions::perform() {
 
     for (auto&& i : m_items) {
         if (enable) {
+#if !__DEPLOY_ON_XP_SP2__
             VarSanityCheck::check_var_after_exec(i.var, *i.recv_info);
+#endif
 
             if (i.shape_sync_hdl)
                 i.shape_sync_hdl->sync_from_var();
diff --git a/src/core/impl/graph/static_infer_impl.cpp b/src/core/impl/graph/static_infer_impl.cpp
index 908e9318..9562b58c 100644
--- a/src/core/impl/graph/static_infer_impl.cpp
+++ b/src/core/impl/graph/static_infer_impl.cpp
@@ -141,7 +141,11 @@ MGB_DEFINE_CLS_WITH_SUPER(StaticInferManagerImpl::TagConstShapeTrait final,
             TagTraitBase) //  {
     struct InferResultCache {
         Spinlock mtx;
+#if __DEPLOY_ON_XP_SP2__
+        ThinHashMap<size_t, InpElement> storage;
+#else
         ThinHashMap<std::thread::id, InpElement> storage;
+#endif
     };
     static TagTraitArray sm_empty_deps;
     static InferResultCache sm_result_cache;
@@ -167,7 +171,11 @@ MGB_DEFINE_CLS_WITH_SUPER(StaticInferManagerImpl::TagConstShapeTrait final,
             {
                 // thread_local not supported on ios; so we us a manual impl
                 MGB_LOCK_GUARD(sm_result_cache.mtx);
+#if __DEPLOY_ON_XP_SP2__
+                ret = &sm_result_cache.storage[0];
+#else
                 ret = &sm_result_cache.storage[std::this_thread::get_id()];
+#endif
             }
             ret->m_shape = &tag()->shape();
             return ret;
diff --git a/src/core/impl/graph/static_infer_impl.h b/src/core/impl/graph/static_infer_impl.h
index 7594851f..f2af0e2c 100644
--- a/src/core/impl/graph/static_infer_impl.h
+++ b/src/core/impl/graph/static_infer_impl.h
@@ -122,7 +122,7 @@ class StaticInferManagerImpl final: public StaticInferManager {
         struct TagTraitContainer;
 
         ComputingGraph * const m_owner_graph;
-        std::recursive_mutex m_mtx;
+        MGB_RECURSIVE_MUTEX m_mtx;
 
         //! callbacks to be invoked in destructor
         ThinHashMap<void*, thin_function<void()>> m_dtor_callbacks;
diff --git a/src/core/impl/graph/var_node.cpp b/src/core/impl/graph/var_node.cpp
index 4a2ab692..596313c6 100644
--- a/src/core/impl/graph/var_node.cpp
+++ b/src/core/impl/graph/var_node.cpp
@@ -20,7 +20,7 @@ using namespace cg;
 
 /* ===================== MemAllocPlan =====================  */
 
-std::mutex MemAllocPlan::ReadonlyFwdList::list_mutex;
+MGB_MUTEX MemAllocPlan::ReadonlyFwdList::list_mutex;
 
 void MemAllocPlan::ReadonlyFwdList::reset() {
     MGB_LOCK_GUARD(list_mutex);
diff --git a/src/core/impl/graph/var_node_mem_mgr.h b/src/core/impl/graph/var_node_mem_mgr.h
index be69da17..d657a0b9 100644
--- a/src/core/impl/graph/var_node_mem_mgr.h
+++ b/src/core/impl/graph/var_node_mem_mgr.h
@@ -440,7 +440,7 @@ class VarNodeMemManager {
 
         ImpureMemPlanManager m_impure_mem_plan_mgr;
 
-        std::mutex m_dynamic_alloc_mtx;
+        MGB_MUTEX m_dynamic_alloc_mtx;
         const size_t* m_run_id_ptr = nullptr;
 
         SyncableCounter m_cpu_async_release_barrier;
diff --git a/src/core/impl/system.cpp b/src/core/impl/system.cpp
index a9e695d8..8673d6a7 100644
--- a/src/core/impl/system.cpp
+++ b/src/core/impl/system.cpp
@@ -19,7 +19,13 @@ using namespace mgb;
 using namespace sys;
 
 int sys::get_cpu_count() {
+#if __DEPLOY_ON_XP_SP2__
+    //! when deployed on XP SP2 only a single thread is supported,
+    //! so return 1 even if the machine has more than one CPU
+    return 1;
+#else
     return std::max(std::thread::hardware_concurrency(), 1u);
+#endif
 }
 
 #if defined(WIN32)
@@ -153,9 +159,11 @@ bool sys::stderr_ansi_color() {
 void sys::set_thread_name(const std::string &) {
 }
 
+#if !__DEPLOY_ON_XP_SP2__
 std::string sys::get_thread_name(Maybe<std::thread::id>) {
     return "@";
 }
+#endif
 
 namespace {
     class FakeTimedFuncInvoker final: public TimedFuncInvoker {
@@ -254,6 +262,7 @@ void sys::set_thread_name(const std::string &name) {
 #endif
 }
 
+#if !__DEPLOY_ON_XP_SP2__
 std::string sys::get_thread_name(Maybe<std::thread::id> tid_) {
 #if MGB_ENABLE_DEBUG_UTIL
     MGB_LOCK_GUARD(thread_name_map_lock);
@@ -269,10 +278,11 @@ std::string sys::get_thread_name(Maybe<std::thread::id> tid_) {
     return "";
 #endif
 }
+#endif
 
 namespace {
 
-class TimedFuncInvokerImpl final: public TimedFuncInvoker {
+class TimedFuncInvokerImpl final : public TimedFuncInvoker {
     /*
      * server-client protocol:
      *
@@ -308,7 +318,7 @@ class TimedFuncInvokerImpl final: public TimedFuncInvoker {
 
     bool m_watcher_should_stop = false;
     std::condition_variable m_watcher_stop_cv;
-    std::mutex m_watcher_stop_mtx, m_global_mtx;
+    MGB_MUTEX m_watcher_stop_mtx, m_global_mtx;
 
     void clear_sock_fd() {
         if (m_peer_fd)
@@ -567,8 +577,10 @@ class TimedFuncInvokerImpl final: public TimedFuncInvoker {
         auto start = high_resolution_clock::now(),
              end = start + timeout_due;
         for (; ; ) {
+#if !__DEPLOY_ON_XP_SP2__
             std::unique_lock<std::mutex> lk(m_watcher_stop_mtx);
             m_watcher_stop_cv.wait_until(lk, end);
+#endif
 
             if (m_watcher_should_stop)
                 return false;
@@ -603,10 +615,9 @@ class TimedFuncInvokerImpl final: public TimedFuncInvoker {
             } MGB_CATCH(..., {});
             clear_sock_fd();
         }
-
 };
 
-} // anonymous namespace
+}  // anonymous namespace
 
 TimedFuncInvoker& TimedFuncInvoker::ins() {
     static TimedFuncInvokerImpl impl;
diff --git a/src/core/include/megbrain/common.h b/src/core/include/megbrain/common.h
index 5283e1b8..1972c14c 100644
--- a/src/core/include/megbrain/common.h
+++ b/src/core/include/megbrain/common.h
@@ -205,6 +205,21 @@ void __log__(LogLevel level, const char *file, const char *func, int line,
 #define MGB_TOKENPASTE2(x, y) MGB_TOKENPASTE(x, y)
 #define MGB_LOCK_GUARD_CTOR(mtx) MGB_TOKENPASTE2(__lock_guard_, __LINE__)(mtx)
 
+#if __DEPLOY_ON_XP_SP2__
+//! refer to
+//! https://docs.microsoft.com/en-us/cpp/build/configuring-programs-for-windows-xp?view=msvc-160
+//! XP SP2 lacks the KERNEL32.dll APIs that the VC runtime needs for
+//! std::mutex/std::thread/std::condition_variable. As a workaround, mutex
+//! types become size_t placeholders and lock guards only mark the variable
+//! as used; multi-threading and related MegEngine features are disabled.
+#define MGB_MUTEX size_t
+#define MGB_RECURSIVE_MUTEX size_t
+#define MGB_LOCK_GUARD(mtx) MGB_MARK_USED_VAR(mtx)
+#define MGB_LOCK_GUARD_UNIQUE(mtx) MGB_MARK_USED_VAR(mtx)
+#define MGB_LOCK_GUARD_SHARED(mtx) MGB_MARK_USED_VAR(mtx)
+#else
+#define MGB_MUTEX std::mutex
+#define MGB_RECURSIVE_MUTEX std::recursive_mutex
 #define MGB_LOCK_GUARD(mtx) \
     std::lock_guard<decltype(mtx)> MGB_LOCK_GUARD_CTOR(mtx)
 
@@ -212,7 +227,8 @@ void __log__(LogLevel level, const char *file, const char *func, int line,
     std::unique_lock<decltype(mtx)> MGB_LOCK_GUARD_CTOR(mtx)
 
 #define MGB_LOCK_GUARD_SHARED(mtx) \
-	std::shared_lock<decltype(mtx)> MGB_LOCK_GUARD_CTOR(mtx)
+    std::shared_lock<decltype(mtx)> MGB_LOCK_GUARD_CTOR(mtx)
+#endif
 
 /*!
  * \brief printf-like std::string constructor
diff --git a/src/core/include/megbrain/graph/var_node.h b/src/core/include/megbrain/graph/var_node.h
index 0ef2fdb6..fedb7fae 100644
--- a/src/core/include/megbrain/graph/var_node.h
+++ b/src/core/include/megbrain/graph/var_node.h
@@ -222,7 +222,7 @@ class MemAllocPlan final: public json::Serializable, public NonCopyableObj {
     private:
         class ReadonlyFwdList {
             MemAllocPlan *m_prev = nullptr, *m_next = nullptr;
-            static std::mutex list_mutex;
+            static MGB_MUTEX list_mutex;
         public:
             MemAllocPlan* next() const { return m_next; }
             void reset();
diff --git a/src/core/include/megbrain/system.h b/src/core/include/megbrain/system.h
index 66b83690..1b2cd47e 100644
--- a/src/core/include/megbrain/system.h
+++ b/src/core/include/megbrain/system.h
@@ -27,11 +27,13 @@ namespace sys {
     //! set name of caller thread
     void set_thread_name(const std::string &name);
 
+#if !__DEPLOY_ON_XP_SP2__
     /*!
      * \brief get name of of given thread
      * \param tid thread id, or None to for the caller thread
      */
     std::string get_thread_name(Maybe<std::thread::id> tid = None);
+#endif
 
     //! get number of CPU cores on this system
     int get_cpu_count();
diff --git a/src/core/include/megbrain/utils/async_worker_impl_0.h b/src/core/include/megbrain/utils/async_worker_impl_0.h
index 2801a188..6dd96437 100644
--- a/src/core/include/megbrain/utils/async_worker_impl_0.h
+++ b/src/core/include/megbrain/utils/async_worker_impl_0.h
@@ -35,14 +35,20 @@ class AsyncWorkerSet final: public NonCopyableObj {
 };
 
 class FutureThreadPoolBase : public NonCopyableObj {
+#if !__DEPLOY_ON_XP_SP2__
     std::vector<std::thread::id> m_ids;
+#endif
     public:
         FutureThreadPoolBase(const Maybe<std::string>& = None) {}
 
+#if __DEPLOY_ON_XP_SP2__
+        size_t start(size_t concurrency) { return concurrency; }
+#else
         const std::vector<std::thread::id>& start(size_t concurrency) {
             m_ids.resize(concurrency, std::this_thread::get_id());
             return m_ids;
         }
+#endif
 
         void stop() {
         }
diff --git a/src/core/include/megbrain/utils/event.h b/src/core/include/megbrain/utils/event.h
index a6ce7c19..66a33cff 100644
--- a/src/core/include/megbrain/utils/event.h
+++ b/src/core/include/megbrain/utils/event.h
@@ -53,7 +53,7 @@ class SyncEventConnecter: public NonCopyableObj {
     using ReceiverMap = ThinHashMap<Typeinfo*, ReceiverList>;
 
     bool m_is_empty = true;
-    std::mutex m_mtx;
+    MGB_MUTEX m_mtx;
     //! map from type to receiver; use shared_ptr because it would be kept by
     //! handlers
     std::shared_ptr<ReceiverMap> m_receiver_map =
diff --git a/src/core/include/megbrain/utils/persistent_cache.h b/src/core/include/megbrain/utils/persistent_cache.h
index 523a4ad6..4871dcce 100644
--- a/src/core/include/megbrain/utils/persistent_cache.h
+++ b/src/core/include/megbrain/utils/persistent_cache.h
@@ -83,7 +83,7 @@ namespace mgb {
                 std::string,
                 std::unordered_map<BlobStorage, BlobStorage, BlobStorage::Hash>>
                 m_cache;
-        std::mutex m_mtx;
+        MGB_MUTEX m_mtx;
     };
 
     /*!
diff --git a/src/opr/impl/basic_arith.cpp b/src/opr/impl/basic_arith.cpp
index d7554fb6..ee63cafb 100644
--- a/src/opr/impl/basic_arith.cpp
+++ b/src/opr/impl/basic_arith.cpp
@@ -33,7 +33,7 @@ namespace {
     template<class Opr>
     class StaticInferOpr {
         intl::UniqPtrWithCN<Opr> m_opr;
-        std::mutex m_mtx;
+        MGB_MUTEX m_mtx;
 
         public:
             class Lock {
@@ -43,7 +43,9 @@ namespace {
                 explicit Lock(StaticInferOpr *owner):
                     m_owner{owner}
                 {
+#if !__DEPLOY_ON_XP_SP2__
                     m_owner->m_mtx.lock();
+#endif
                 }
 
                 public:
@@ -54,8 +56,10 @@ namespace {
                     }
 
                     ~Lock() {
+#if !__DEPLOY_ON_XP_SP2__
                         if (m_owner)
                             m_owner->m_mtx.unlock();
+#endif
                     }
 
                     Lock& operator = (const Lock &) = delete;
diff --git a/src/opr/impl/internal/indexing_helper.cpp b/src/opr/impl/internal/indexing_helper.cpp
index db4f9087..29ccf9b6 100644
--- a/src/opr/impl/internal/indexing_helper.cpp
+++ b/src/opr/impl/internal/indexing_helper.cpp
@@ -277,7 +277,7 @@ SubTensorSpec FancyIndexingHelper::fancy_indexing_make_sub_spec(
     mgb_assert(m_require_scalar_index || !fake_single_idx);
 
     static DeviceTensorND fake_val;
-    static std::mutex fake_val_mtx;
+    static MGB_MUTEX fake_val_mtx;
 
     if (mgb_unlikely(fake_val.empty())) {
         MGB_LOCK_GUARD(fake_val_mtx);
diff --git a/src/opr/impl/internal/megdnn_opr_wrapper.cpp b/src/opr/impl/internal/megdnn_opr_wrapper.cpp
index a920dcbe..ae34d63d 100644
--- a/src/opr/impl/internal/megdnn_opr_wrapper.cpp
+++ b/src/opr/impl/internal/megdnn_opr_wrapper.cpp
@@ -53,7 +53,7 @@ namespace {
         MGB_TYPEINFO_OBJ_DECL;
 
         public:
-            std::mutex mtx;
+            MGB_MUTEX mtx;
             CompNode::UnorderedMap<DeviceTensorStorage> cn2storage;
     };
     MGB_TYPEINFO_OBJ_IMPL(TempStorageContainer);
diff --git a/src/opr/impl/io.cpp b/src/opr/impl/io.cpp
index 316e4969..4a691213 100644
--- a/src/opr/impl/io.cpp
+++ b/src/opr/impl/io.cpp
@@ -377,7 +377,7 @@ MGB_DYN_TYPE_OBJ_FINAL_IMPL(SharedDeviceTensorWithFormat);
 MGB_DYN_TYPE_OBJ_FINAL_IMPL(ImmutableTensor);
 
 class ImmutableTensor::Value {
-    std::mutex m_mtx;
+    MGB_MUTEX m_mtx;
     DeviceTensorND m_dev, m_static_infer;
     std::string m_summary;
 
@@ -527,7 +527,7 @@ class ImmutableTensor::DevValueCache final: public UserDataContainer::UserData {
     std::unordered_map<TensorKey, Value, Hash> m_tensor2val;
     std::unordered_map<ScalarKey, Value, Hash> m_scalar2val;
 
-    std::mutex m_mtx;
+    MGB_MUTEX m_mtx;
 
     void setup_value(Value &dest, const HostTensorND &val) {
         dest.setup(m_comp_node, val);
diff --git a/src/opr/impl/loop/impl.cpp b/src/opr/impl/loop/impl.cpp
index d2f6b61b..3207104c 100644
--- a/src/opr/impl/loop/impl.cpp
+++ b/src/opr/impl/loop/impl.cpp
@@ -888,7 +888,7 @@ class LoopImpl::MutableStateSaver::Recorder final: public NonCopyableObj {
 
     //! mutex for m_saved_buckets, used between copy_bucket_to_host() and the
     //! async copy task in m_copy_threadpool
-    std::mutex m_saved_buckets_mtx;
+    MGB_MUTEX m_saved_buckets_mtx;
     //! see on_fwd_finish()
     TensorShape m_var_shape;
     bool m_enabled = false;
diff --git a/src/opr/impl/search_policy/profiler.cpp b/src/opr/impl/search_policy/profiler.cpp
index d2272abe..379c208e 100644
--- a/src/opr/impl/search_policy/profiler.cpp
+++ b/src/opr/impl/search_policy/profiler.cpp
@@ -356,7 +356,9 @@ typename TimedProfiler<Opr>::TResult TimedProfiler<Opr>::prof_impl(
             next_report_time = timer.get_secs() + 1;
         }
         using namespace std::literals;
+#if !__DEPLOY_ON_XP_SP2__
         std::this_thread::sleep_for(1000us);
+#endif
     }
     // release all free blocks owned by child process,
     // in order to avoid main process running out of memory
diff --git a/src/opr/impl/utility.cpp b/src/opr/impl/utility.cpp
index 808e858f..2aa82fd2 100644
--- a/src/opr/impl/utility.cpp
+++ b/src/opr/impl/utility.cpp
@@ -731,7 +731,7 @@ class PersistentOutputStorage::StorageHolder final
                                      key.second);
         }
     };
-    std::mutex m_mtx;
+    MGB_MUTEX m_mtx;
     std::unordered_map<Key, DeviceTensorStorage, KeyHash> m_storage;
 
 public:
diff --git a/src/plugin/impl/var_value_checker.cpp b/src/plugin/impl/var_value_checker.cpp
index 04ae4c7d..6bf07464 100644
--- a/src/plugin/impl/var_value_checker.cpp
+++ b/src/plugin/impl/var_value_checker.cpp
@@ -125,9 +125,13 @@ void VarValueChecker::on_var_computed(VarNode *var) {
     }
 
     if (!m_init_val_dumped) {
+#if !__DEPLOY_ON_XP_SP2__
         m_var2val_mtx.lock();
-        auto &&val = m_var2val[var];
+#endif
+        auto&& val = m_var2val[var];
+#if !__DEPLOY_ON_XP_SP2__
         m_var2val_mtx.unlock();
+#endif
 
         mgb_assert(!val);
         val = std::make_shared<DeviceTensorND>();
diff --git a/src/plugin/include/megbrain/plugin/cpu_dispatch_checker.h b/src/plugin/include/megbrain/plugin/cpu_dispatch_checker.h
index cacca348..fb30563c 100644
--- a/src/plugin/include/megbrain/plugin/cpu_dispatch_checker.h
+++ b/src/plugin/include/megbrain/plugin/cpu_dispatch_checker.h
@@ -22,7 +22,7 @@ namespace mgb {
      * This is intended to find potential bugs in megdnn.
      */
     class CPUDispatchChecker final: public PluginBase {
-        std::mutex m_cn2nr_task_mtx,
+        MGB_MUTEX m_cn2nr_task_mtx,
             m_failed_oprs_mtx_storage,
             *m_failed_oprs_mtx = &m_failed_oprs_mtx_storage;
         CompNode::UnorderedMap<size_t> m_cn2nr_task;
diff --git a/src/plugin/include/megbrain/plugin/opr_io_dump.h b/src/plugin/include/megbrain/plugin/opr_io_dump.h
index 84bc55de..00dfe0cc 100644
--- a/src/plugin/include/megbrain/plugin/opr_io_dump.h
+++ b/src/plugin/include/megbrain/plugin/opr_io_dump.h
@@ -60,7 +60,7 @@ class TextOprIODump final : public OprIODumpBase {
     bool m_print_addr = true;
     std::shared_ptr<FILE> m_fout;
     size_t m_max_size = 5;
-    std::mutex m_mtx;
+    MGB_MUTEX m_mtx;
     std::unique_ptr<LazyValueRecorder> m_lazy_value;
 
     void dump_var(VarNode* var, bool lazy_sync) override;
diff --git a/src/plugin/include/megbrain/plugin/var_sanity_check.h b/src/plugin/include/megbrain/plugin/var_sanity_check.h
index 7f32cc8e..f18b0dde 100644
--- a/src/plugin/include/megbrain/plugin/var_sanity_check.h
+++ b/src/plugin/include/megbrain/plugin/var_sanity_check.h
@@ -64,7 +64,7 @@ class VarSanityCheck final : public PluginBase {
 
     //! map from caller thread to workspace map
     ThinHashMap<std::thread::id, WorkspaceCache> m_workspace;
-    std::mutex m_workspace_mtx;
+    MGB_MUTEX m_workspace_mtx;
 
     ThinHashMap<VarNode*, ChecksumResult> m_var2chksum;
     /*! the ids of varnodes that have been modified by recv_opr
@@ -72,7 +72,7 @@ class VarSanityCheck final : public PluginBase {
      * cg::OperatorNodeBase::NodeProp::Flag:: FORCE_UPDATE_INPUT_VAR.
      */
     ThinHashSet<VarNode*> m_modified_vars;
-    std::mutex m_id2chksum_mtx;
+    MGB_MUTEX m_id2chksum_mtx;
 
     typedef void (VarSanityCheck::*input_checker_fn)(cg::OperatorNodeBase*,
                                                      VarNode*);
diff --git a/src/plugin/include/megbrain/plugin/var_value_checker.h b/src/plugin/include/megbrain/plugin/var_value_checker.h
index b9b356f7..2041a337 100644
--- a/src/plugin/include/megbrain/plugin/var_value_checker.h
+++ b/src/plugin/include/megbrain/plugin/var_value_checker.h
@@ -50,7 +50,7 @@ namespace mgb {
         size_t m_cur_var_idx, m_nr_exec;
 
         VarNodeArray m_vars;
-        std::mutex m_var2val_mtx;
+        MGB_MUTEX m_var2val_mtx;
         ThinHashMap<VarNode*, std::shared_ptr<DeviceTensorND>> m_var2val;
         Checker m_checker;