diff --git a/imperative/python/megengine/__init__.py b/imperative/python/megengine/__init__.py
index bed7694d..860f6f0e 100644
--- a/imperative/python/megengine/__init__.py
+++ b/imperative/python/megengine/__init__.py
@@ -8,6 +8,7 @@
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 import atexit
 import ctypes
+import re
 import os
 import platform
 import sys
@@ -89,6 +90,9 @@ if sys.platform == "win32":
 from .core._imperative_rt.core2 import close as _close
 from .core._imperative_rt.core2 import full_sync as _full_sync
 from .core._imperative_rt.core2 import sync as _sync
+from .core._imperative_rt.common import (
+    get_supported_sm_versions as _get_supported_sm_versions,
+)
 from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func
 from .config import *
 from .device import *
@@ -99,6 +103,25 @@ from .utils import comp_graph_tools as cgtools
 from .utils.persistent_cache import PersistentCacheOnServer as _PersistentCacheOnServer
 from .version import __version__
 
+
+logger = get_logger(__name__)
+ngpus = get_device_count("gpu")
+supported_sm_versions = re.findall(r"sm_(\d+)", _get_supported_sm_versions())
+for idx in range(ngpus):
+    prop = get_cuda_device_property(idx)
+    cur_sm = str(prop.major * 10 + prop.minor)
+    if cur_sm not in supported_sm_versions:
+        logger.warning(
+            "{} with CUDA capability sm_{} is not compatible with the current MegEngine installation. The current MegEngine install supports CUDA {} {}. If you want to use the {} with MegEngine, please check the instructions at https://github.com/MegEngine/MegEngine/blob/master/scripts/cmake-build/BUILD_README.md".format(
+                prop.name,
+                cur_sm,
+                "capabilities" if len(supported_sm_versions) > 1 else "capability",
+                " ".join(["sm_" + v for v in supported_sm_versions]),
+                prop.name,
+            )
+        )
+
+
 _set_fork_exec_path_for_timed_func(
     sys.executable,
     os.path.join(os.path.dirname(__file__), "utils", "_timed_func_fork_exec_entry.py"),
diff --git a/imperative/python/megengine/device.py b/imperative/python/megengine/device.py
index c6497b89..8580ce17 100644
--- a/imperative/python/megengine/device.py
+++ b/imperative/python/megengine/device.py
@@ -11,9 +11,7 @@ import re
 from typing import Optional
 
 from .core._imperative_rt.common import CompNode, DeviceType
-from .core._imperative_rt.common import (
-    get_cuda_compute_capability as _get_cuda_compute_capability,
-)
+from .core._imperative_rt.common import get_device_prop as _get_device_prop
 from .core._imperative_rt.common import set_prealloc_config as _set_prealloc_config
 from .core._imperative_rt.common import what_is_xpu as _what_is_xpu
 from .core._imperative_rt.utils import _try_coalesce_all_free_memory
@@ -25,6 +23,7 @@ __all__ = [
     "set_default_device",
     "get_mem_status_bytes",
     "get_cuda_compute_capability",
+    "get_cuda_device_property",
    "get_allocated_memory",
     "get_reserved_memory",
     "get_max_reserved_memory",
@@ -161,7 +160,12 @@ def get_cuda_compute_capability(device: int, device_type=DeviceType.CUDA) -> int
     Returns:
         a version number, or `SM version`.
""" - return _get_cuda_compute_capability(device, device_type) + prop = _get_device_prop(device, device_type) + return prop.major * 10 + prop.minor + + +def get_cuda_device_property(device: int, device_type=DeviceType.CUDA): + return _get_device_prop(device, device_type) def get_allocated_memory(device: Optional[str] = None): diff --git a/imperative/python/src/common.cpp b/imperative/python/src/common.cpp index 6a9ac154..90e09aea 100644 --- a/imperative/python/src/common.cpp +++ b/imperative/python/src/common.cpp @@ -123,6 +123,23 @@ void init_common(py::module m) { py::implicitly_convertible(); + py::class_(m, "DeviceProperties") + .def(py::init()) + .def_property_readonly( + "name", + [](const CompNode::DeviceProperties prop) { return prop.name; }) + .def_property_readonly( + "total_memory", + [](const CompNode::DeviceProperties prop) { + return prop.total_memory; + }) + .def_property_readonly( + "major", + [](const CompNode::DeviceProperties prop) { return prop.major; }) + .def_property_readonly("minor", [](const CompNode::DeviceProperties prop) { + return prop.minor; + }); + def_TensorND(m, "DeviceTensorND") .def("numpy", [](const DeviceTensorND& self) { HostTensorND hv; @@ -223,7 +240,12 @@ void init_common(py::module m) { m.def("set_prealloc_config", &CompNode::set_prealloc_config, "specifies how to pre-allocate from raw dev allocator"); - m.def("get_cuda_compute_capability", &CompNode::get_compute_capability); + m.def("get_device_prop", &CompNode::get_device_prop); + + m.def("get_supported_sm_versions", []() { + static const char* mge_gen_code = MGE_CUDA_GENCODE; + return mge_gen_code; + }); m.def("what_is_xpu", [] { return CompNode::Locator::parse("xpux").to_physical().type; }); diff --git a/src/core/impl/comp_node/comp_node.cpp b/src/core/impl/comp_node/comp_node.cpp index 1dbcedbe..bff186d1 100644 --- a/src/core/impl/comp_node/comp_node.cpp +++ b/src/core/impl/comp_node/comp_node.cpp @@ -431,13 +431,12 @@ void CompNode::set_prealloc_config( }; } -size_t CompNode::get_compute_capability(int dev, DeviceType device_type) { +CompNode::DeviceProperties CompNode::get_device_prop(int dev, DeviceType device_type) { switch (device_type) { case DeviceType::CUDA: - return CudaCompNode::get_compute_capability(dev); + return CudaCompNode::get_device_prop(dev); default: - mgb_log_warn("unsupport device type for get_compute_capability"); - return 0; + mgb_throw(MegBrainError, "unsupport device type for get_device_prop"); }; } diff --git a/src/core/impl/comp_node/cuda/comp_node.cpp b/src/core/impl/comp_node/cuda/comp_node.cpp index d31c9dc6..7a7fae11 100644 --- a/src/core/impl/comp_node/cuda/comp_node.cpp +++ b/src/core/impl/comp_node/cuda/comp_node.cpp @@ -192,11 +192,11 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl { //! return whether global finalized, and print warning in such case static inline bool check_global_finalized(); + static CompNode::DeviceProperties get_device_prop(int dev); + //! 
enable peer copy from dev0 to dev1 static void enable_peer_access(int dev0, int dev1); - static size_t get_compute_capability(int dev); - static void static_free_device(ImplBase* self, void* ptr) { static_cast(self)->free_device(ptr); } @@ -208,6 +208,8 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl { public: CompNodeImpl() : Impl(static_free_device, static_free_host) {} + static constexpr int MAX_NR_COMP_NODE = 1024, MAX_NR_DEVICE = 64; + void* alloc_device(size_t size) override; void free_device(void* ptr); @@ -332,8 +334,6 @@ struct CudaCompNodeImpl::DeviceInfo { }; struct CudaCompNodeImpl::StaticData { - static constexpr int MAX_NR_COMP_NODE = 1024, MAX_NR_DEVICE = 64; - std::recursive_mutex mtx; mem_alloc::DevMemAlloc::PreAllocConfig prealloc_config; @@ -376,6 +376,13 @@ struct CudaCompNodeImpl::StaticData { CudaCompNodeImpl::StaticData* CudaCompNodeImpl::sd = nullptr; Spinlock CudaCompNodeImpl::sd_mtx; +struct DevicePropRec { + bool init = false; + CompNode::DeviceProperties prop; + Spinlock mtx_com; +}; +DevicePropRec device_prop_rec[CudaCompNodeImpl::MAX_NR_DEVICE]; + void CudaCompNodeImpl::init(const Locator& locator, const Locator& locator_logical) { m_locator = locator; m_locator_logical = locator_logical; @@ -564,7 +571,7 @@ void CudaCompNodeImpl::sync() { } void CudaCompNodeImpl::enable_peer_access(int dev0, int dev1) { - static bool already_enabled[StaticData::MAX_NR_DEVICE][StaticData::MAX_NR_DEVICE]; + static bool already_enabled[MAX_NR_DEVICE][MAX_NR_DEVICE]; if (already_enabled[dev0][dev1]) return; @@ -817,6 +824,52 @@ CUresult call_cuda_forksafe(Func func, Val* val, Args... args) { return err; return err2; } +template +CUresult call_cuda_forksafe(Func func, char* val, int len, Args... args) { + auto err = func(val, len, args...); + if (err != CUDA_ERROR_NOT_INITIALIZED) + return err; + // cuInit not called, call it in child process + int fd[2]; + mgb_assert(pipe(fd) == 0, "pipe() failed"); + int fdr = fd[0], fdw = fd[1]; + RAIICloseFD fdr_guard(fdr); + RAIICloseFD fdw_guard(fdw); + auto cpid = fork(); + mgb_assert(cpid != -1, "fork() failed"); + if (cpid == 0) { + fdr_guard.close(); + do { + err = cuInit(0); + if (err != CUDA_SUCCESS) + break; + err = func(val, len, args...); + } while (0); + auto sz = write(fdw, &err, sizeof(err)); + if (sz == sizeof(err) && err == CUDA_SUCCESS) { + sz = write(fdw, val, sizeof(*val) * len); + } + fdw_guard.close(); + std::quick_exit(0); + } + fdw_guard.close(); + auto sz = read(fdr, &err, sizeof(err)); + mgb_assert(sz == sizeof(err), "failed to read error code from child"); + if (err == CUDA_SUCCESS) { + sz = read(fdr, val, sizeof(*val) * len); + mgb_assert( + static_cast(sz) == sizeof(*val) * static_cast(len), + "failed to read value from child"); + return err; + } + // try again, maybe another thread called cuInit while we fork + auto err2 = func(val, len, args...); + if (err2 == CUDA_SUCCESS) + return err2; + if (err2 == CUDA_ERROR_NOT_INITIALIZED) + return err; + return err2; +} #endif const char* cu_get_error_string(CUresult err) { @@ -914,10 +967,12 @@ CompNode::Impl* CudaCompNode::load_cuda( } if (!available_node) { - mgb_assert(sd.nr_node < sd.MAX_NR_COMP_NODE, "too many CompNode allocated"); + mgb_assert( + sd.nr_node < CompNodeImpl::MAX_NR_COMP_NODE, + "too many CompNode allocated"); available_node = &sd.node[sd.nr_node++]; } - mgb_assert(locator.device < sd.MAX_NR_DEVICE, "device number too large"); + mgb_assert(locator.device < CompNodeImpl::MAX_NR_DEVICE, "device number too large"); 
     mgb_assert(!available_node->m_initialized);
     available_node->init(locator, locator_logical);
 
@@ -1023,29 +1078,39 @@ void CudaCompNode::set_prealloc_config(
     }
 }
 
-size_t CudaCompNode::get_compute_capability(int dev) {
-    size_t cnt = get_device_count();
-    if (dev < 0 || dev >= static_cast<int>(cnt)) {
-        mgb_log_error("request gpu %d out of valid range [0, %lu)", dev, cnt);
-        return 0;
-    }
-    static Spinlock mtx_com;
-    MGB_LOCK_GUARD(mtx_com);
-    int pmajor;
-    int pminor;
-    auto err = call_cuda_forksafe(
-            cuDeviceGetAttribute, &pmajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-            dev);
-    if (err != CUDA_SUCCESS) {
-        return 0;
-    }
-    auto err2 = call_cuda_forksafe(
-            cuDeviceGetAttribute, &pminor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
-            dev);
-    if (err2 != CUDA_SUCCESS) {
-        return 0;
+CompNode::DeviceProperties CudaCompNode::get_device_prop(int dev) {
+    int cnt = static_cast<int>(get_device_count());
+    mgb_assert(
+            dev >= 0 && dev < cnt, "request gpu %d out of valid range [0, %d)", dev,
+            cnt);
+
+    auto&& rec = device_prop_rec[dev];
+    if (!rec.init) {
+        MGB_LOCK_GUARD(rec.mtx_com);
+        if (!rec.init) {
+            char pname[256] = {0};
+            mgb_assert(
+                    call_cuda_forksafe(
+                            cuDeviceGetAttribute, &rec.prop.major,
+                            CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+                            dev) == CUDA_SUCCESS);
+            mgb_assert(
+                    call_cuda_forksafe(
+                            cuDeviceGetAttribute, &rec.prop.minor,
+                            CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
+                            dev) == CUDA_SUCCESS);
+            mgb_assert(
+                    call_cuda_forksafe(cuDeviceGetName, pname, 255, dev) ==
+                    CUDA_SUCCESS);
+            mgb_assert(
+                    call_cuda_forksafe(cuDeviceTotalMem, &rec.prop.total_memory, dev) ==
+                    CUDA_SUCCESS);
+            rec.prop.name = pname;
+            rec.init = true;
+        }
     }
-    return pmajor * 10 + pminor;
+
+    return rec.prop;
 }
 
 #else
@@ -1067,8 +1132,8 @@ void CudaCompNode::sync_all() {}
 void CudaCompNode::set_prealloc_config(
         size_t alignment, size_t min_req, size_t max_overhead, double growth_factor) {}
 
-size_t CudaCompNode::get_compute_capability(int dev) {
-    return 0;
+CompNode::DeviceProperties CudaCompNode::get_device_prop(int dev) {
+    return CompNode::DeviceProperties{};
 }
 
 #undef err
diff --git a/src/core/impl/comp_node/cuda/comp_node.h b/src/core/impl/comp_node/cuda/comp_node.h
index 4d3c9110..5199c4f8 100644
--- a/src/core/impl/comp_node/cuda/comp_node.h
+++ b/src/core/impl/comp_node/cuda/comp_node.h
@@ -31,7 +31,7 @@ public:
     static size_t get_device_count(bool warn = true);
     static Impl* load_cuda(const Locator& locator, const Locator& locator_logical);
     static void sync_all();
-    static size_t get_compute_capability(int dev);
+    static DeviceProperties get_device_prop(int dev);
 
     static void set_prealloc_config(
             size_t alignment, size_t min_req, size_t max_overhead,
diff --git a/src/core/include/megbrain/comp_node.h b/src/core/include/megbrain/comp_node.h
index 787054a9..2f9c986e 100644
--- a/src/core/include/megbrain/comp_node.h
+++ b/src/core/include/megbrain/comp_node.h
@@ -80,6 +80,20 @@ public:
     static constexpr size_t NR_DEVICE_TYPE =
             static_cast<size_t>(DeviceType::MAX_DEVICE_ID);
 
+    struct DeviceProperties {
+        DeviceProperties() {
+            name = "unspec";
+            total_memory = major = minor = 0;
+        }
+
+        std::string name;
+        size_t total_memory;
+
+        //! for cuda
+        int major;
+        int minor;
+    };
+
     /*!
      * \brief an identifier to specify a computing node
      *
@@ -301,10 +315,11 @@ public:
     MGE_WIN_DECLSPEC_FUC static void set_prealloc_config(
             size_t alignment, size_t min_req, size_t max_overhead,
             double growth_factor, DeviceType device_type);
+
     /*!
-     * \brief get compute capability of the specified device
+     * \brief get device property of the specified device
      */
-    MGE_WIN_DECLSPEC_FUC static size_t get_compute_capability(
+    MGE_WIN_DECLSPEC_FUC static DeviceProperties get_device_prop(
             int dev, DeviceType device_type);
 
     /* =================== synchronization ======================== */
diff --git a/src/megbrain_build_config.h.in b/src/megbrain_build_config.h.in
index 2f198120..0ffcadfb 100644
--- a/src/megbrain_build_config.h.in
+++ b/src/megbrain_build_config.h.in
@@ -268,5 +268,6 @@
 #endif
 
 #define GIT_FULL_HASH "@GIT_FULL_HASH@"
+#define MGE_CUDA_GENCODE "@MGE_CUDA_GENCODE@"
 
 #endif  // _HEADER_MGB_BUILD_CONFIG
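
For reference, this is how the Python-facing API introduced above can be exercised. A minimal sketch, assuming a CUDA-enabled MegEngine build with at least one visible GPU:

```python
import megengine.device as dev

if dev.get_device_count("gpu") > 0:
    prop = dev.get_cuda_device_property(0)
    print(prop.name)          # device name reported by the CUDA driver
    print(prop.total_memory)  # total device memory in bytes
    # get_cuda_compute_capability is now derived from the same record:
    assert dev.get_cuda_compute_capability(0) == prop.major * 10 + prop.minor
```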
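
The import-time warning added to `__init__.py` extracts SM versions from the build-time `MGE_CUDA_GENCODE` string exposed through `get_supported_sm_versions`. A small sketch of just the parsing step, with a hypothetical gencode value (the real contents depend on how MegEngine was built):

```python
import re

# Hypothetical example value; the real string is baked in at build time.
gencode = "-gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75"
print(re.findall(r"sm_(\d+)", gencode))  # ['70', '75']
```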
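
The new `call_cuda_forksafe` overload follows the existing pattern: when the driver reports `CUDA_ERROR_NOT_INITIALIZED`, the query runs in a forked child so that the parent process never initializes CUDA as a side effect of a property lookup. A hypothetical POSIX-only Python sketch of the same fork-and-pipe idea (`call_forksafe` and the pickle framing are illustrative, not part of MegEngine):

```python
import os
import pickle

def call_forksafe(fn, *args):
    # Run fn(*args) in a forked child and ship the result back through a
    # pipe, so the parent never pays fn's side effects (for MegEngine,
    # CUDA driver initialization).
    r, w = os.pipe()
    pid = os.fork()
    if pid == 0:  # child: compute, write length-prefixed result, exit
        os.close(r)
        payload = pickle.dumps(fn(*args))
        os.write(w, len(payload).to_bytes(8, "little") + payload)
        os._exit(0)  # skip cleanup handlers, like std::quick_exit(0)
    os.close(w)
    size = int.from_bytes(os.read(r, 8), "little")
    data = b""
    while len(data) < size:
        data += os.read(r, size - len(data))
    os.close(r)
    os.waitpid(pid, 0)
    return pickle.loads(data)
```

The C++ version additionally retries in the parent when the child fails, since another thread may have called `cuInit` in the meantime.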