From cf53d9e0f82aff73edab767044b59e72c35a5b29 Mon Sep 17 00:00:00 2001 From: Megvii Engine Team Date: Mon, 7 Dec 2020 19:04:35 +0800 Subject: [PATCH] fix(mgb/tensor): do tensor overlap check only when d2d and h2h GitOrigin-RevId: 9125936a350baab699961b26decd314a39921545 --- src/core/impl/comp_node/atlas/comp_node.h | 4 +- src/core/impl/comp_node/cambricon/comp_node.h | 5 +- src/core/impl/comp_node/cpu/comp_node.h | 3 +- src/core/impl/comp_node/cuda/comp_node.h | 5 +- src/core/impl/comp_node/rocm/comp_node.h | 4 +- src/core/impl/tensor.cpp | 81 ++++++++++++++++++++++----- src/core/include/megbrain/comp_node.h | 4 ++ 7 files changed, 84 insertions(+), 22 deletions(-) diff --git a/src/core/impl/comp_node/atlas/comp_node.h b/src/core/impl/comp_node/atlas/comp_node.h index 3bf95ea6..644b1836 100644 --- a/src/core/impl/comp_node/atlas/comp_node.h +++ b/src/core/impl/comp_node/atlas/comp_node.h @@ -18,7 +18,9 @@ namespace mgb { class AtlasCompNode final : public CompNodeImplHelper { public: - static constexpr Flag sm_flag = Flag::QUEUE_LIMITED | Flag::HAS_COPY_STREAM; + static constexpr Flag sm_flag = Flag::QUEUE_LIMITED | + Flag::HAS_COPY_STREAM | + Flag::SUPPORT_UNIFIED_ADDRESS; class CompNodeImpl; class EventImpl; diff --git a/src/core/impl/comp_node/cambricon/comp_node.h b/src/core/impl/comp_node/cambricon/comp_node.h index a9fb6794..b4c04495 100644 --- a/src/core/impl/comp_node/cambricon/comp_node.h +++ b/src/core/impl/comp_node/cambricon/comp_node.h @@ -16,8 +16,9 @@ namespace mgb { class CambriconCompNode final: public CompNodeImplHelper { public: - static constexpr Flag sm_flag = - Flag::QUEUE_LIMITED | Flag::HAS_COPY_STREAM; + static constexpr Flag sm_flag = Flag::QUEUE_LIMITED | + Flag::HAS_COPY_STREAM | + Flag::SUPPORT_UNIFIED_ADDRESS; class CompNodeImpl; class EventImpl; diff --git a/src/core/impl/comp_node/cpu/comp_node.h b/src/core/impl/comp_node/cpu/comp_node.h index d8a79ecd..58ea7cca 100644 --- a/src/core/impl/comp_node/cpu/comp_node.h +++ 
b/src/core/impl/comp_node/cpu/comp_node.h @@ -38,7 +38,8 @@ namespace mgb { static constexpr Flag sm_flag = Flag::SUPPORT_RECORDER | Flag::RECORDER_SUPPORT_DYNAMIC_ALLOC | - Flag::EVENT_DTOR_UNSAFE; + Flag::EVENT_DTOR_UNSAFE | + Flag::SUPPORT_UNIFIED_ADDRESS; //! base class for comp nodes that can be dispatched on CPU. //! This is currently used by CPU, FPGA and CADENCE diff --git a/src/core/impl/comp_node/cuda/comp_node.h b/src/core/impl/comp_node/cuda/comp_node.h index f394e560..b4fd8444 100644 --- a/src/core/impl/comp_node/cuda/comp_node.h +++ b/src/core/impl/comp_node/cuda/comp_node.h @@ -16,8 +16,9 @@ namespace mgb { class CudaCompNode final: public CompNodeImplHelper { public: - static constexpr Flag sm_flag = - Flag::QUEUE_LIMITED | Flag::HAS_COPY_STREAM; + static constexpr Flag sm_flag = Flag::QUEUE_LIMITED | + Flag::HAS_COPY_STREAM | + Flag::SUPPORT_UNIFIED_ADDRESS; class CompNodeImpl; class EventImpl; diff --git a/src/core/impl/comp_node/rocm/comp_node.h b/src/core/impl/comp_node/rocm/comp_node.h index 8725855d..6d2a9524 100644 --- a/src/core/impl/comp_node/rocm/comp_node.h +++ b/src/core/impl/comp_node/rocm/comp_node.h @@ -16,7 +16,9 @@ namespace mgb { class ROCmCompNode final : public CompNodeImplHelper { public: - static constexpr Flag sm_flag = Flag::QUEUE_LIMITED | Flag::HAS_COPY_STREAM; + static constexpr Flag sm_flag = Flag::QUEUE_LIMITED | + Flag::HAS_COPY_STREAM | + Flag::SUPPORT_UNIFIED_ADDRESS; class CompNodeImpl; class EventImpl; diff --git a/src/core/impl/tensor.cpp b/src/core/impl/tensor.cpp index d811056e..e6c94322 100644 --- a/src/core/impl/tensor.cpp +++ b/src/core/impl/tensor.cpp @@ -518,6 +518,60 @@ DEF(sub, )(const SubTensorSpec &spec) const { // def } /* ===================== TensorND::copy_from ===================== */ +namespace { +/** + * \brief determine whether to check overlap of two tensors. 
+ * \return true : when HostStorage || (DeviceStorage && SUPPORT_UNIFIED_ADDRESS) + * \note when both support unified address, we can treat them both on CPU. So, + * overlap check should be done + */ +template +inline bool should_check_overlap(const TensorND& dst, + const TensorND& src) { + return true; +} + +template <> +inline bool should_check_overlap( + const HostTensorND& dst, const DeviceTensorND& src) { + return src.comp_node().contain_flag( + CompNode::Flag::SUPPORT_UNIFIED_ADDRESS); +} + +template <> +inline bool should_check_overlap( + const DeviceTensorND& dst, const HostTensorND& src) { + return dst.comp_node().contain_flag( + CompNode::Flag::SUPPORT_UNIFIED_ADDRESS); +} + +/** + * \brief D2D tensor copy should check overlap when + * 1. They are on the same mem node. But note that the address must be logical + * comparable. i.e. the original address alloc on enflame is uncomparable. + * 2. They both support unified address, so can be treated as CPU address. + */ +template <> +inline bool should_check_overlap( + const DeviceTensorND& dst, const DeviceTensorND& src) { + bool is_same_memnode = + dst.comp_node().mem_node() == src.comp_node().mem_node(); + bool unified_address = src.comp_node().contain_flag( + CompNode::Flag::SUPPORT_UNIFIED_ADDRESS) && + dst.comp_node().contain_flag( + CompNode::Flag::SUPPORT_UNIFIED_ADDRESS); + return is_same_memnode || unified_address; +} + +/** + * \brief check overlap of two tensors. 
throw exception when overlapped */ +inline void check_overlapped(const dt_byte* dst_min, const dt_byte* dst_max, + const dt_byte* src_min, const dt_byte* src_max) { + mgb_throw_if(src_min < dst_max && dst_min < src_max, TensorCopyOverlapError, + "could not perform copy between overlapped tensors"); +} +} // namespace template template @@ -539,12 +593,12 @@ TensorND::copy_from(const TensorND &src) { return static_cast(*this); } if (src.layout().is_physical_contiguous()) { - const dt_byte - *dst_min = m_storage.ptr(), *dst_max = dst_min + size_bytes, - *src_min = src.storage().ptr(), *src_max = src_min + size_bytes; - mgb_throw_if(src_max > dst_min && dst_max > src_min, - TensorCopyOverlapError, - "cound not perform copy between overlapped tensors"); + if (should_check_overlap(*this, src)) { + check_overlapped(m_storage.ptr(), + m_storage.ptr() + size_bytes, + src.storage().ptr(), + src.storage().ptr() + size_bytes); + } m_storage.copy_from(src.storage(), size_bytes); return static_cast(*this); } @@ -574,15 +628,12 @@ TensorND::copy_from_fixlayout( src_span = src.layout().span(), dst_span = layout().span(); - const dt_byte - *src_ptr_min = src.raw_ptr() + src_span.low_byte, - *src_ptr_max = src.raw_ptr() + src_span.high_byte, - *dst_ptr_min = this->raw_ptr() + dst_span.low_byte, - *dst_ptr_max = this->raw_ptr() + dst_span.high_byte; - - mgb_throw_if(src_ptr_max > dst_ptr_min && dst_ptr_max > src_ptr_min, - TensorCopyOverlapError, - "cound not perform copy between overlapped tensors"); + if (should_check_overlap(*this, src)) { + check_overlapped(this->raw_ptr() + dst_span.low_byte, + this->raw_ptr() + dst_span.high_byte, + src.raw_ptr() + src_span.low_byte, + src.raw_ptr() + src_span.high_byte); + } bool self_contig = m_layout.is_physical_contiguous(), src_contig = src.layout().is_physical_contiguous(); diff --git a/src/core/include/megbrain/comp_node.h b/src/core/include/megbrain/comp_node.h index e2920f07..2dd50972 100644 ---
a/src/core/include/megbrain/comp_node.h +++ b/src/core/include/megbrain/comp_node.h @@ -436,6 +436,10 @@ class CompNode { //! MGB_HAVE_THREAD=0. Usually this means that execution on the //! CompNode is synchronous, i.e. behaves like cpu:default SUPPORT_NO_THREAD = 1 << 5, + + //! Whether this comp node supports unified address. i.e. CPU and + //! CUDA supports unified address. + SUPPORT_UNIFIED_ADDRESS = 1 << 6, }; bool contain_flag(Flag flag) {