From 0a665ea4527e1a5f1ddd029d4134337cb77b9eec Mon Sep 17 00:00:00 2001
From: Megvii Engine Team <megengine@megvii.com>
Date: Wed, 11 Aug 2021 17:18:00 +0800
Subject: [PATCH] feat(mge/device): enable getting cuda compute capability

GitOrigin-RevId: b5d3f2225cf946378c7e26fe67d9c3ed38c065c0
---
 imperative/python/megengine/device.py          | 13 ++++++
 imperative/python/src/common.cpp               |  2 +
 .../test/unit/distributed/test_distributed.py  | 15 +++++++
 src/core/impl/comp_node/comp_node.cpp          | 10 +++++
 src/core/impl/comp_node/cuda/comp_node.cpp     | 51 +++++++++++++++++-----
 src/core/impl/comp_node/cuda/comp_node.h       |  1 +
 src/core/include/megbrain/comp_node.h          |  4 ++
 7 files changed, 85 insertions(+), 11 deletions(-)

diff --git a/imperative/python/megengine/device.py b/imperative/python/megengine/device.py
index 06a93313..56ab0428 100644
--- a/imperative/python/megengine/device.py
+++ b/imperative/python/megengine/device.py
@@ -11,6 +11,9 @@ import re
 from typing import Optional
 
 from .core._imperative_rt.common import CompNode, DeviceType
+from .core._imperative_rt.common import (
+    get_cuda_compute_capability as _get_cuda_compute_capability,
+)
 from .core._imperative_rt.common import set_prealloc_config as _set_prealloc_config
 from .core._imperative_rt.common import what_is_xpu as _what_is_xpu
 
@@ -20,6 +23,7 @@ __all__ = [
     "get_default_device",
     "set_default_device",
     "get_mem_status_bytes",
+    "get_cuda_compute_capability",
     "set_prealloc_config",
     "DeviceType",
 ]
@@ -126,6 +130,15 @@ def get_mem_status_bytes(device: Optional[str] = None):
     return tot, free
 
 
+def get_cuda_compute_capability(device: int, device_type=DeviceType.CUDA) -> int:
+    r"""
+    Get the compute capability of the specified device.
+
+    Returns the SM version, encoded as ``major * 10 + minor``.
+    """
+    return _get_cuda_compute_capability(device, device_type)
+
+
 set_default_device(os.getenv("MGE_DEFAULT_DEVICE", "xpux"))

diff --git a/imperative/python/src/common.cpp b/imperative/python/src/common.cpp
index b3704b74..3c2c4e7c 100644
--- a/imperative/python/src/common.cpp
+++ b/imperative/python/src/common.cpp
@@ -185,6 +185,8 @@ void init_common(py::module m) {
     m.def("set_prealloc_config", &CompNode::set_prealloc_config,
           "specifies how to pre-allocate from raw dev allocator");
 
+    m.def("get_cuda_compute_capability", &CompNode::get_compute_capability);
+
     m.def("what_is_xpu", []{
         return CompNode::Locator::parse("xpux").to_physical().type;
     });

diff --git a/imperative/python/test/unit/distributed/test_distributed.py b/imperative/python/test/unit/distributed/test_distributed.py
index 0fcae864..0ed0c82d 100644
--- a/imperative/python/test/unit/distributed/test_distributed.py
+++ b/imperative/python/test/unit/distributed/test_distributed.py
@@ -229,3 +229,18 @@ def test_user_set_pop():
         assert ret == 1
 
     worker()
+
+
+@pytest.mark.require_ngpu(2)
+@pytest.mark.isolated_distributed
+def test_get_cuda_compute_capability():
+
+    assert mge.device.get_cuda_compute_capability(0) > 0
+    assert mge.device.get_cuda_compute_capability(1) > 0
+
+    @dist.launcher
+    def worker():
+        x = mge.tensor([1.0])
+        assert mge.device.get_cuda_compute_capability(dist.get_rank()) > 0
+
+    worker()
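For context, a minimal usage sketch (ours, not part of the patch) of the Python API added above. The encoding follows the C++ implementation further down: the result is major * 10 + minor, and 0 signals an out-of-range device index or a failed driver query.

import megengine as mge

# get_cuda_compute_capability returns major * 10 + minor
# (e.g. 70 for an SM 7.0 GPU) and 0 on failure.
cc = mge.device.get_cuda_compute_capability(0)
if cc > 0:
    major, minor = divmod(cc, 10)
    print("GPU 0 has compute capability {}.{}".format(major, minor))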
diff --git a/src/core/impl/comp_node/comp_node.cpp b/src/core/impl/comp_node/comp_node.cpp
index cca03b65..a5824258 100644
--- a/src/core/impl/comp_node/comp_node.cpp
+++ b/src/core/impl/comp_node/comp_node.cpp
@@ -444,6 +444,16 @@ void CompNode::set_prealloc_config(
     };
 }
 
+size_t CompNode::get_compute_capability(int dev, DeviceType device_type) {
+    switch (device_type) {
+        case DeviceType::CUDA:
+            return CudaCompNode::get_compute_capability(dev);
+        default:
+            mgb_log_warn("unsupported device type for get_compute_capability");
+            return 0;
+    };
+}
+
 void* CompNode::alloc_device(size_t size) const {
     auto ret = m_impl->alloc_device(size);
     static_cast<Impl*>(m_impl)->env().on_mem_event(size, true, ret);
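The next diff generalizes get_device_count_forksafe(int*) into a variadic call_cuda_forksafe(func, val, args...), so the same machinery can back both cuDeviceGetCount and the cuDeviceGetAttribute calls used for the compute-capability query. The trick it preserves: on unix, the query runs in a forked child that calls cuInit and pipes the result back, so a process that merely asks about the devices never initializes the CUDA driver itself (driver state initialized before a fork() is generally unusable in the children). A minimal Python sketch of that fork-and-pipe idea, assuming a unix host (the name call_forksafe is ours; the real C++ code also short-circuits when CUDA is already initialized, and retries in the parent in case another thread won the cuInit race during the fork):

import os
import struct


def call_forksafe(query):
    # Sketch only: run `query` (which may initialize CUDA) in a forked
    # child and ship its int result back over a pipe, so the parent
    # process never runs cuInit itself.
    rfd, wfd = os.pipe()
    pid = os.fork()
    if pid == 0:  # child: query, report, exit without cleanup handlers
        os.close(rfd)
        os.write(wfd, struct.pack("i", query()))
        os._exit(0)
    os.close(wfd)
    data = os.read(rfd, struct.calcsize("i"))  # parent: collect the answer
    os.close(rfd)
    os.waitpid(pid, 0)  # reap the child
    return struct.unpack("i", data)[0] if data else None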
diff --git a/src/core/impl/comp_node/cuda/comp_node.cpp b/src/core/impl/comp_node/cuda/comp_node.cpp
index b6b18268..81326444 100644
--- a/src/core/impl/comp_node/cuda/comp_node.cpp
+++ b/src/core/impl/comp_node/cuda/comp_node.cpp
@@ -202,6 +202,8 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl {
     //! enable peer copy from dev0 to dev1
     static void enable_peer_access(int dev0, int dev1);
 
+    static size_t get_compute_capability(int dev);
+
     static void static_free_device(ImplBase* self, void* ptr) {
         static_cast<CompNodeImpl*>(self)->free_device(ptr);
     }
@@ -709,9 +711,10 @@ void CudaCompNode::EventImpl::do_device_wait_by(Impl* cn_impl) {
 namespace {
 
 #ifndef __unix__
-CUresult get_device_count_forksafe(int* pcnt) {
+template <typename Func, typename... Args>
+CUresult call_cuda_forksafe(Func func, Args... args) {
     cuInit(0);
-    return cuDeviceGetCount(pcnt);
+    return func(args...);
 }
 #else
 struct RAIICloseFD : NonCopyableObj {
@@ -727,8 +730,9 @@ struct RAIICloseFD : NonCopyableObj {
     }
 };
 // an implementation that does not call cuInit
-CUresult get_device_count_forksafe(int* pcnt) {
-    auto err = cuDeviceGetCount(pcnt);
+template <typename Func, typename Val, typename... Args>
+CUresult call_cuda_forksafe(Func func, Val* val, Args... args) {
+    auto err = func(val, args...);
     if (err != CUDA_ERROR_NOT_INITIALIZED) return err;
     // cuInit not called, call it in child process
     int fd[2];
@@ -743,11 +747,11 @@ CUresult get_device_count_forksafe(int* pcnt) {
         do {
             err = cuInit(0);
             if (err != CUDA_SUCCESS) break;
-            err = cuDeviceGetCount(pcnt);
+            err = func(val, args...);
         } while (0);
         auto sz = write(fdw, &err, sizeof(err));
         if (sz == sizeof(err) && err == CUDA_SUCCESS) {
-            sz = write(fdw, pcnt, sizeof(*pcnt));
+            sz = write(fdw, val, sizeof(*val));
         }
         fdw_guard.close();
         std::quick_exit(0);
@@ -756,12 +760,12 @@ CUresult get_device_count_forksafe(int* pcnt) {
     auto sz = read(fdr, &err, sizeof(err));
     mgb_assert(sz == sizeof(err), "failed to read error code from child");
     if (err == CUDA_SUCCESS) {
-        sz = read(fdr, pcnt, sizeof(*pcnt));
-        mgb_assert(sz == sizeof(*pcnt), "failed to read device count from child");
+        sz = read(fdr, val, sizeof(*val));
+        mgb_assert(sz == sizeof(*val), "failed to read value from child");
         return err;
     }
     // try again, maybe another thread called cuInit while we fork
-    auto err2 = cuDeviceGetCount(pcnt);
+    auto err2 = func(val, args...);
     if (err2 == CUDA_SUCCESS) return err2;
     if (err2 == CUDA_ERROR_NOT_INITIALIZED) return err;
     return err2;
@@ -783,7 +787,7 @@ bool CudaCompNode::available() {
     MGB_LOCK_GUARD(mtx);
     if (result == -1) {
         int ndev = -1;
-        auto err = get_device_count_forksafe(&ndev);
+        auto err = call_cuda_forksafe(cuDeviceGetCount, &ndev);
         result = err == CUDA_SUCCESS && ndev > 0;
         if (!result) {
             mgb_log_warn("cuda unavailable: %s(%d) ndev=%d",
@@ -934,7 +938,7 @@ size_t CudaCompNode::get_device_count(bool warn) {
     static Spinlock mtx;
     MGB_LOCK_GUARD(mtx);
     if (cnt == -1) {
-        auto err = get_device_count_forksafe(&cnt);
+        auto err = call_cuda_forksafe(cuDeviceGetCount, &cnt);
         if (err != CUDA_SUCCESS) {
             if (warn)
                 mgb_log_error("cudaGetDeviceCount failed: %s (err %d)",
@@ -970,6 +974,27 @@ void CudaCompNode::set_prealloc_config(size_t alignment, size_t min_req,
     }
 }
 
+size_t CudaCompNode::get_compute_capability(int dev) {
+    size_t cnt = get_device_count();
+    if (dev < 0 || dev >= static_cast<int>(cnt)) {
+        mgb_log_error("request gpu %d out of valid range [0, %lu)", dev, cnt);
+        return 0;
+    }
+    static Spinlock mtx_com;
+    MGB_LOCK_GUARD(mtx_com);
+    int pmajor;
+    int pminor;
+    auto err = call_cuda_forksafe(cuDeviceGetAttribute, &pmajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
+    if (err != CUDA_SUCCESS) {
+        return 0;
+    }
+    auto err2 = call_cuda_forksafe(cuDeviceGetAttribute, &pminor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
+    if (err2 != CUDA_SUCCESS) {
+        return 0;
+    }
+    return pmajor * 10 + pminor;
+}
+
 #else
 
 bool CudaCompNode::available() {
@@ -990,6 +1015,10 @@ void CudaCompNode::set_prealloc_config(size_t alignment, size_t min_req,
                                        size_t max_overhead,
                                        double growth_factor) {}
 
+size_t CudaCompNode::get_compute_capability(int dev) {
+    return 0;
+}
+
 #undef err
 
 #endif  // MGB_CUDA

diff --git a/src/core/impl/comp_node/cuda/comp_node.h b/src/core/impl/comp_node/cuda/comp_node.h
index 83e662d7..6d555239 100644
--- a/src/core/impl/comp_node/cuda/comp_node.h
+++ b/src/core/impl/comp_node/cuda/comp_node.h
@@ -33,6 +33,7 @@ namespace mgb {
         static Impl* load_cuda(
                 const Locator &locator, const Locator &locator_logical);
         static void sync_all();
+        static size_t get_compute_capability(int dev);
 
         static void set_prealloc_config(size_t alignment, size_t min_req,
                                         size_t max_overhead, double growth_factor);

diff --git a/src/core/include/megbrain/comp_node.h b/src/core/include/megbrain/comp_node.h
index d03a3efd..67dd2f6d 100644
--- a/src/core/include/megbrain/comp_node.h
+++ b/src/core/include/megbrain/comp_node.h
@@ -298,6 +298,10 @@ class CompNode {
         static void set_prealloc_config(size_t alignment, size_t min_req,
                                         size_t max_overhead, double growth_factor,
                                         DeviceType device_type);
+        /*!
+         * \brief get compute capability of the specified device
+         */
+        static size_t get_compute_capability(int dev, DeviceType device_type);
 
         /* =================== synchronization ======================== */
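As a follow-up usage note: because the result is encoded as major * 10 + minor, callers can gate features on a single integer comparison. A hypothetical policy sketch (the helper name and the dtype choice are ours, not part of the patch; SM 7.0, encoded as 70, is the first generation with tensor cores):

from megengine.device import get_cuda_compute_capability


def pick_compute_dtype(device: int = 0) -> str:
    # Hypothetical policy: prefer float16 where tensor cores exist
    # (compute capability >= 7.0, i.e. an encoded value >= 70).
    cc = get_cuda_compute_capability(device)
    return "float16" if cc >= 70 else "float32"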