From 0a665ea4527e1a5f1ddd029d4134337cb77b9eec Mon Sep 17 00:00:00 2001
From: Megvii Engine Team <megengine@megvii.com>
Date: Wed, 11 Aug 2021 17:18:00 +0800
Subject: [PATCH] feat(mge/device): enable getting cuda compute capability

GitOrigin-RevId: b5d3f2225cf946378c7e26fe67d9c3ed38c065c0
---
 imperative/python/megengine/device.py          | 13 ++++++
 imperative/python/src/common.cpp               |  2 +
 .../test/unit/distributed/test_distributed.py  | 15 +++++++
 src/core/impl/comp_node/comp_node.cpp          | 10 +++++
 src/core/impl/comp_node/cuda/comp_node.cpp     | 51 +++++++++++++++++-----
 src/core/impl/comp_node/cuda/comp_node.h       |  1 +
 src/core/include/megbrain/comp_node.h          |  4 ++
 7 files changed, 85 insertions(+), 11 deletions(-)

diff --git a/imperative/python/megengine/device.py b/imperative/python/megengine/device.py
index 06a93313..56ab0428 100644
--- a/imperative/python/megengine/device.py
+++ b/imperative/python/megengine/device.py
@@ -11,6 +11,9 @@ import re
 from typing import Optional
 
 from .core._imperative_rt.common import CompNode, DeviceType
+from .core._imperative_rt.common import (
+    get_cuda_compute_capability as _get_cuda_compute_capability,
+)
 from .core._imperative_rt.common import set_prealloc_config as _set_prealloc_config
 from .core._imperative_rt.common import what_is_xpu as _what_is_xpu
 
@@ -20,6 +23,7 @@ __all__ = [
     "get_default_device",
     "set_default_device",
     "get_mem_status_bytes",
+    "get_cuda_compute_capability",
     "set_prealloc_config",
     "DeviceType",
 ]
@@ -126,6 +130,15 @@ def get_mem_status_bytes(device: Optional[str] = None):
     return tot, free
 
 
+def get_cuda_compute_capability(device: int, device_type=DeviceType.CUDA) -> int:
+    r"""
+    Get the compute capability of the specified device.
+
+    Returns the SM version, encoded as ``major * 10 + minor``.
+    """
+    return _get_cuda_compute_capability(device, device_type)
+
+
 set_default_device(os.getenv("MGE_DEFAULT_DEVICE", "xpux"))

diff --git a/imperative/python/src/common.cpp b/imperative/python/src/common.cpp
index b3704b74..3c2c4e7c 100644
--- a/imperative/python/src/common.cpp
+++ b/imperative/python/src/common.cpp
@@ -185,6 +185,8 @@ void init_common(py::module m) {
     m.def("set_prealloc_config", &CompNode::set_prealloc_config,
           "specifies how to pre-allocate from raw dev allocator");
 
+    m.def("get_cuda_compute_capability", &CompNode::get_compute_capability);
+
     m.def("what_is_xpu", []{
         return CompNode::Locator::parse("xpux").to_physical().type;
     });

diff --git a/imperative/python/test/unit/distributed/test_distributed.py b/imperative/python/test/unit/distributed/test_distributed.py
index 0fcae864..0ed0c82d 100644
--- a/imperative/python/test/unit/distributed/test_distributed.py
+++ b/imperative/python/test/unit/distributed/test_distributed.py
@@ -229,3 +229,18 @@ def test_user_set_pop():
         assert ret == 1
 
     worker()
+
+
+@pytest.mark.require_ngpu(2)
+@pytest.mark.isolated_distributed
+def test_get_cuda_compute_capability():
+
+    assert mge.device.get_cuda_compute_capability(0) > 0
+    assert mge.device.get_cuda_compute_capability(1) > 0
+
+    @dist.launcher
+    def worker():
+        x = mge.tensor([1.0])
+        assert mge.device.get_cuda_compute_capability(dist.get_rank()) > 0
+
+    worker()
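For context, a minimal usage sketch (ours, not part of the patch) of the Python API added above. The encoding follows the C++ implementation further down: the result is major * 10 + minor, and 0 signals an out-of-range device index or a failed driver query.

import megengine as mge

# get_cuda_compute_capability returns major * 10 + minor
# (e.g. 70 for an SM 7.0 GPU) and 0 on failure.
cc = mge.device.get_cuda_compute_capability(0)
if cc > 0:
    major, minor = divmod(cc, 10)
    print("GPU 0 has compute capability {}.{}".format(major, minor))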
diff --git a/src/core/impl/comp_node/comp_node.cpp b/src/core/impl/comp_node/comp_node.cpp
index cca03b65..a5824258 100644
--- a/src/core/impl/comp_node/comp_node.cpp
+++ b/src/core/impl/comp_node/comp_node.cpp
@@ -444,6 +444,16 @@ void CompNode::set_prealloc_config(
     };
 }
 
+size_t CompNode::get_compute_capability(int dev, DeviceType device_type) {
+    switch (device_type) {
+        case DeviceType::CUDA:
+            return CudaCompNode::get_compute_capability(dev);
+        default:
+            mgb_log_warn("unsupported device type for get_compute_capability");
+            return 0;
+    };
+}
+
 void* CompNode::alloc_device(size_t size) const {
     auto ret = m_impl->alloc_device(size);
     static_cast<Impl*>(m_impl)->env().on_mem_event(size, true, ret);
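The next diff generalizes get_device_count_forksafe(int*) into a variadic call_cuda_forksafe(func, val, args...), so the same machinery can back both cuDeviceGetCount and the cuDeviceGetAttribute calls used for the compute-capability query. The trick it preserves: on unix, the query runs in a forked child that calls cuInit and pipes the result back, so a process that merely asks about the devices never initializes the CUDA driver itself (driver state initialized before a fork() is generally unusable in the children). A minimal Python sketch of that fork-and-pipe idea, assuming a unix host (the name call_forksafe is ours; the real C++ code also short-circuits when CUDA is already initialized, and retries in the parent in case another thread won the cuInit race during the fork):

import os
import struct


def call_forksafe(query):
    # Sketch only: run `query` (which may initialize CUDA) in a forked
    # child and ship its int result back over a pipe, so the parent
    # process never runs cuInit itself.
    rfd, wfd = os.pipe()
    pid = os.fork()
    if pid == 0:  # child: query, report, exit without cleanup handlers
        os.close(rfd)
        os.write(wfd, struct.pack("i", query()))
        os._exit(0)
    os.close(wfd)
    data = os.read(rfd, struct.calcsize("i"))  # parent: collect the answer
    os.close(rfd)
    os.waitpid(pid, 0)  # reap the child
    return struct.unpack("i", data)[0] if data else None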
diff --git a/src/core/impl/comp_node/cuda/comp_node.cpp b/src/core/impl/comp_node/cuda/comp_node.cpp
index b6b18268..81326444 100644
--- a/src/core/impl/comp_node/cuda/comp_node.cpp
+++ b/src/core/impl/comp_node/cuda/comp_node.cpp
@@ -202,6 +202,8 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl {
     //! enable peer copy from dev0 to dev1
     static void enable_peer_access(int dev0, int dev1);
 
+    static size_t get_compute_capability(int dev);
+
     static void static_free_device(ImplBase* self, void* ptr) {
         static_cast<CompNodeImpl*>(self)->free_device(ptr);
     }
@@ -709,9 +711,10 @@ void CudaCompNode::EventImpl::do_device_wait_by(Impl* cn_impl) {
 namespace {
 
 #ifndef __unix__
-CUresult get_device_count_forksafe(int* pcnt) {
+template <typename Func, typename... Args>
+CUresult call_cuda_forksafe(Func func, Args... args) {
     cuInit(0);
-    return cuDeviceGetCount(pcnt);
+    return func(args...);
 }
 #else
 struct RAIICloseFD : NonCopyableObj {
@@ -727,8 +730,9 @@ struct RAIICloseFD : NonCopyableObj {
     }
 };
 // an implementation that does not call cuInit
-CUresult get_device_count_forksafe(int* pcnt) {
-    auto err = cuDeviceGetCount(pcnt);
+template <typename Func, typename Val, typename... Args>
+CUresult call_cuda_forksafe(Func func, Val* val, Args... args) {
+    auto err = func(val, args...);
     if (err != CUDA_ERROR_NOT_INITIALIZED) return err;
     // cuInit not called, call it in child process
     int fd[2];
@@ -743,11 +747,11 @@ CUresult get_device_count_forksafe(int* pcnt) {
         do {
             err = cuInit(0);
             if (err != CUDA_SUCCESS) break;
-            err = cuDeviceGetCount(pcnt);
+            err = func(val, args...);
         } while (0);
         auto sz = write(fdw, &err, sizeof(err));
         if (sz == sizeof(err) && err == CUDA_SUCCESS) {
-            sz = write(fdw, pcnt, sizeof(*pcnt));
+            sz = write(fdw, val, sizeof(*val));
         }
         fdw_guard.close();
         std::quick_exit(0);
@@ -756,12 +760,12 @@ CUresult get_device_count_forksafe(int* pcnt) {
     auto sz = read(fdr, &err, sizeof(err));
     mgb_assert(sz == sizeof(err), "failed to read error code from child");
     if (err == CUDA_SUCCESS) {
-        sz = read(fdr, pcnt, sizeof(*pcnt));
-        mgb_assert(sz == sizeof(*pcnt), "failed to read device count from child");
+        sz = read(fdr, val, sizeof(*val));
+        mgb_assert(sz == sizeof(*val), "failed to read value from child");
         return err;
     }
     // try again, maybe another thread called cuInit while we fork
-    auto err2 = cuDeviceGetCount(pcnt);
+    auto err2 = func(val, args...);
     if (err2 == CUDA_SUCCESS) return err2;
     if (err2 == CUDA_ERROR_NOT_INITIALIZED) return err;
     return err2;
@@ -783,7 +787,7 @@ bool CudaCompNode::available() {
     MGB_LOCK_GUARD(mtx);
     if (result == -1) {
         int ndev = -1;
-        auto err = get_device_count_forksafe(&ndev);
+        auto err = call_cuda_forksafe(cuDeviceGetCount, &ndev);
         result = err == CUDA_SUCCESS && ndev > 0;
         if (!result) {
             mgb_log_warn("cuda unavailable: %s(%d) ndev=%d",
@@ -934,7 +938,7 @@ size_t CudaCompNode::get_device_count(bool warn) {
     static Spinlock mtx;
     MGB_LOCK_GUARD(mtx);
     if (cnt == -1) {
-        auto err = get_device_count_forksafe(&cnt);
+        auto err = call_cuda_forksafe(cuDeviceGetCount, &cnt);
         if (err != CUDA_SUCCESS) {
             if (warn)
                 mgb_log_error("cudaGetDeviceCount failed: %s (err %d)",
@@ -970,6 +974,27 @@ void CudaCompNode::set_prealloc_config(size_t alignment, size_t min_req,
     }
 }
 
+size_t CudaCompNode::get_compute_capability(int dev) {
+    size_t cnt = get_device_count();
+    if (dev < 0 || dev >= static_cast<int>(cnt)) {
+        mgb_log_error("request gpu %d out of valid range [0, %lu)", dev, cnt);
+        return 0;
+    }
+    static Spinlock mtx_com;
+    MGB_LOCK_GUARD(mtx_com);
+    int pmajor;
+    int pminor;
+    auto err = call_cuda_forksafe(cuDeviceGetAttribute, &pmajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev);
+    if (err != CUDA_SUCCESS) {
+        return 0;
+    }
+    auto err2 = call_cuda_forksafe(cuDeviceGetAttribute, &pminor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev);
+    if (err2 != CUDA_SUCCESS) {
+        return 0;
+    }
+    return pmajor * 10 + pminor;
+}
+
 #else
 
 bool CudaCompNode::available() {
@@ -990,6 +1015,10 @@ void CudaCompNode::set_prealloc_config(size_t alignment, size_t min_req,
                                        size_t max_overhead,
                                        double growth_factor) {}
 
+size_t CudaCompNode::get_compute_capability(int dev) {
+    return 0;
+}
+
 #undef err
 
 #endif  // MGB_CUDA

diff --git a/src/core/impl/comp_node/cuda/comp_node.h b/src/core/impl/comp_node/cuda/comp_node.h
index 83e662d7..6d555239 100644
--- a/src/core/impl/comp_node/cuda/comp_node.h
+++ b/src/core/impl/comp_node/cuda/comp_node.h
@@ -33,6 +33,7 @@ namespace mgb {
         static Impl* load_cuda(
                 const Locator &locator, const Locator &locator_logical);
         static void sync_all();
+        static size_t get_compute_capability(int dev);
 
         static void set_prealloc_config(size_t alignment, size_t min_req,
                                         size_t max_overhead, double growth_factor);

diff --git a/src/core/include/megbrain/comp_node.h b/src/core/include/megbrain/comp_node.h
index d03a3efd..67dd2f6d 100644
--- a/src/core/include/megbrain/comp_node.h
+++ b/src/core/include/megbrain/comp_node.h
@@ -298,6 +298,10 @@ class CompNode {
         static void set_prealloc_config(size_t alignment, size_t min_req,
                                         size_t max_overhead, double growth_factor,
                                         DeviceType device_type);
+        /*!
+         * \brief get compute capability of the specified device
+         */
+        static size_t get_compute_capability(int dev, DeviceType device_type);
 
         /* =================== synchronization ======================== */
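As a follow-up usage note: because the result is encoded as major * 10 + minor, callers can gate features on a single integer comparison. A hypothetical policy sketch (the helper name and the dtype choice are ours, not part of the patch; SM 7.0, encoded as 70, is the first generation with tensor cores):

from megengine.device import get_cuda_compute_capability


def pick_compute_dtype(device: int = 0) -> str:
    # Hypothetical policy: prefer float16 where tensor cores exist
    # (compute capability >= 7.0, i.e. an encoded value >= 70).
    cc = get_cuda_compute_capability(device)
    return "float16" if cc >= 70 else "float32"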