GitOrigin-RevId: f78c79eb06
release-1.10
@@ -8,6 +8,7 @@
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
import atexit
import ctypes
import re
import os
import platform
import sys
@@ -89,6 +90,9 @@ if sys.platform == "win32":
from .core._imperative_rt.core2 import close as _close
from .core._imperative_rt.core2 import full_sync as _full_sync
from .core._imperative_rt.core2 import sync as _sync
from .core._imperative_rt.common import (
    get_supported_sm_versions as _get_supported_sm_versions,
)
from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func
from .config import *
from .device import *
@@ -99,6 +103,25 @@ from .utils import comp_graph_tools as cgtools
from .utils.persistent_cache import PersistentCacheOnServer as _PersistentCacheOnServer
from .version import __version__

logger = get_logger(__name__)

ngpus = get_device_count("gpu")
supported_sm_versions = re.findall(r"sm_(\d+)", _get_supported_sm_versions())
for idx in range(ngpus):
    prop = get_cuda_device_property(idx)
    cur_sm = str(prop.major * 10 + prop.minor)
    if cur_sm not in supported_sm_versions:
        logger.warning(
            "{} with CUDA capability sm_{} is not compatible with the current MegEngine installation. The current MegEngine install supports CUDA {} {}. If you want to use the {} with MegEngine, please check the instructions at https://github.com/MegEngine/MegEngine/blob/master/scripts/cmake-build/BUILD_README.md".format(
                prop.name,
                cur_sm,
                "capabilities" if len(supported_sm_versions) > 1 else "capability",
                " ".join(["sm_" + v for v in supported_sm_versions]),
                prop.name,
            )
        )

_set_fork_exec_path_for_timed_func(
    sys.executable,
    os.path.join(os.path.dirname(__file__), "utils", "_timed_func_fork_exec_entry.py"),
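For reference, the compatibility check above can be exercised in isolation. The sketch below replays the same logic against a hard-coded gencode string and a made-up device, since _get_supported_sm_versions() and get_cuda_device_property() require a built MegEngine; all names and values here are illustrative.

    import logging
    import re

    logging.basicConfig()
    logger = logging.getLogger("sm_check_demo")

    # Stand-in for _get_supported_sm_versions(): the binding returns the
    # MGE_CUDA_GENCODE string baked in at build time (see the last hunk).
    gencode = "-gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70"
    supported_sm_versions = re.findall(r"sm_(\d+)", gencode)  # ['61', '70']

    # Made-up device in place of get_cuda_device_property(idx).
    name, major, minor = "GeForce RTX 3090", 8, 6
    cur_sm = str(major * 10 + minor)  # '86'
    if cur_sm not in supported_sm_versions:
        logger.warning(
            "%s with CUDA capability sm_%s is not supported by this build (supported: %s)",
            name,
            cur_sm,
            " ".join("sm_" + v for v in supported_sm_versions),
        )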
@@ -11,9 +11,7 @@ import re
from typing import Optional

from .core._imperative_rt.common import CompNode, DeviceType
from .core._imperative_rt.common import (
    get_cuda_compute_capability as _get_cuda_compute_capability,
)
from .core._imperative_rt.common import get_device_prop as _get_device_prop
from .core._imperative_rt.common import set_prealloc_config as _set_prealloc_config
from .core._imperative_rt.common import what_is_xpu as _what_is_xpu
from .core._imperative_rt.utils import _try_coalesce_all_free_memory
@@ -25,6 +23,7 @@ __all__ = [
    "set_default_device",
    "get_mem_status_bytes",
    "get_cuda_compute_capability",
    "get_cuda_device_property",
    "get_allocated_memory",
    "get_reserved_memory",
    "get_max_reserved_memory",
@@ -161,7 +160,12 @@ def get_cuda_compute_capability(device: int, device_type=DeviceType.CUDA) -> int:
    Returns:
        a version number, or `SM version`.
    """
    return _get_cuda_compute_capability(device, device_type)
    prop = _get_device_prop(device, device_type)
    return prop.major * 10 + prop.minor

def get_cuda_device_property(device: int, device_type=DeviceType.CUDA):
    """Gets the properties of the specified device, as exposed by the
    ``DeviceProperties`` binding (``name``, ``total_memory``, ``major``, ``minor``).
    """
    return _get_device_prop(device, device_type)


def get_allocated_memory(device: Optional[str] = None):
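A short usage sketch for the reworked API (assuming a MegEngine build with CUDA and at least one visible GPU; the attribute names follow the DeviceProperties binding in the next hunk):

    from megengine.device import (
        get_cuda_compute_capability,
        get_cuda_device_property,
        get_device_count,
    )

    if get_device_count("gpu") > 0:
        prop = get_cuda_device_property(0)
        print(prop.name, prop.total_memory, prop.major, prop.minor)
        # get_cuda_compute_capability is now derived from the same record:
        assert get_cuda_compute_capability(0) == prop.major * 10 + prop.minor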
@@ -123,6 +123,23 @@ void init_common(py::module m) {
    py::implicitly_convertible<std::string, CompNode>();

    py::class_<CompNode::DeviceProperties>(m, "DeviceProperties")
            .def(py::init())
            .def_property_readonly(
                    "name",
                    [](const CompNode::DeviceProperties& prop) { return prop.name; })
            .def_property_readonly(
                    "total_memory",
                    [](const CompNode::DeviceProperties& prop) {
                        return prop.total_memory;
                    })
            .def_property_readonly(
                    "major",
                    [](const CompNode::DeviceProperties& prop) { return prop.major; })
            .def_property_readonly("minor", [](const CompNode::DeviceProperties& prop) {
                return prop.minor;
            });

    def_TensorND<DeviceTensorND>(m, "DeviceTensorND")
            .def("numpy", [](const DeviceTensorND& self) {
                HostTensorND hv;
@@ -223,7 +240,12 @@ void init_common(py::module m) {
    m.def("set_prealloc_config", &CompNode::set_prealloc_config,
          "specifies how to pre-allocate from raw dev allocator");
    m.def("get_cuda_compute_capability", &CompNode::get_compute_capability);
    m.def("get_device_prop", &CompNode::get_device_prop);
    m.def("get_supported_sm_versions", []() {
        static const char* mge_gen_code = MGE_CUDA_GENCODE;
        return mge_gen_code;
    });
    m.def("what_is_xpu",
          [] { return CompNode::Locator::parse("xpux").to_physical().type; });
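get_supported_sm_versions simply surfaces the compile-time MGE_CUDA_GENCODE string (see the build-config hunk at the end). From Python it can be inspected directly, though the exact contents depend on the build; a hedged sketch via the private module path used above (which may move between releases):

    from megengine.core._imperative_rt.common import get_supported_sm_versions

    # Something like "-gencode arch=compute_61,code=sm_61 ...", depending on
    # which architectures this MegEngine binary was compiled for.
    print(get_supported_sm_versions())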
@@ -431,13 +431,12 @@ void CompNode::set_prealloc_config(
    };
}

size_t CompNode::get_compute_capability(int dev, DeviceType device_type) {
CompNode::DeviceProperties CompNode::get_device_prop(int dev, DeviceType device_type) {
    switch (device_type) {
        case DeviceType::CUDA:
            return CudaCompNode::get_compute_capability(dev);
            return CudaCompNode::get_device_prop(dev);
        default:
            mgb_log_warn("unsupported device type for get_compute_capability");
            return 0;
            mgb_throw(MegBrainError, "unsupported device type for get_device_prop");
    };
}
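Note the behavioral change in this hunk: the old code logged a warning and returned 0 for non-CUDA device types, while the new code throws MegBrainError. Through pybind11 this should surface in Python as an exception; a hedged sketch (the exact exception type, and whether DeviceType.CPU is rejected this way, are assumptions):

    from megengine.core._imperative_rt.common import DeviceType
    from megengine.device import get_cuda_device_property

    try:
        get_cuda_device_property(0, DeviceType.CPU)
    except Exception as exc:  # MegBrainError crossing the pybind11 boundary
        print("get_device_prop rejected the device type:", exc)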
@@ -192,11 +192,11 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl {
    //! return whether global finalized, and print warning in such case
    static inline bool check_global_finalized();

    static CompNode::DeviceProperties get_device_prop(int dev);

    //! enable peer copy from dev0 to dev1
    static void enable_peer_access(int dev0, int dev1);

    static size_t get_compute_capability(int dev);

    static void static_free_device(ImplBase* self, void* ptr) {
        static_cast<CompNodeImpl*>(self)->free_device(ptr);
    }
@@ -208,6 +208,8 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl {
public:
    CompNodeImpl() : Impl(static_free_device, static_free_host) {}

    static constexpr int MAX_NR_COMP_NODE = 1024, MAX_NR_DEVICE = 64;

    void* alloc_device(size_t size) override;

    void free_device(void* ptr);
@@ -332,8 +334,6 @@ struct CudaCompNodeImpl::DeviceInfo {
};

struct CudaCompNodeImpl::StaticData {
    static constexpr int MAX_NR_COMP_NODE = 1024, MAX_NR_DEVICE = 64;

    std::recursive_mutex mtx;

    mem_alloc::DevMemAlloc::PreAllocConfig prealloc_config;
@@ -376,6 +376,13 @@ struct CudaCompNodeImpl::StaticData {
CudaCompNodeImpl::StaticData* CudaCompNodeImpl::sd = nullptr;
Spinlock CudaCompNodeImpl::sd_mtx;

//! lazily filled, per-device cache of DeviceProperties
struct DevicePropRec {
    bool init = false;
    CompNode::DeviceProperties prop;
    Spinlock mtx_com;
};
DevicePropRec device_prop_rec[CudaCompNodeImpl::MAX_NR_DEVICE];

void CudaCompNodeImpl::init(const Locator& locator, const Locator& locator_logical) {
    m_locator = locator;
    m_locator_logical = locator_logical;
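device_prop_rec plus the init flag and per-record spinlock implement double-checked locking: the unsynchronized read of init skips the lock once the cache is warm, and the second check under the lock stops two threads from initializing the same record. A Python analogue of the pattern (with a hypothetical query_device standing in for the CUDA driver calls; under CPython the GIL makes the unlocked read benign):

    import threading

    MAX_NR_DEVICE = 64  # mirrors CompNodeImpl::MAX_NR_DEVICE

    class DevicePropRec:
        def __init__(self):
            self.init = False
            self.prop = None
            self.mtx = threading.Lock()

    device_prop_rec = [DevicePropRec() for _ in range(MAX_NR_DEVICE)]

    def get_device_prop(dev, query_device):
        rec = device_prop_rec[dev]
        if not rec.init:  # fast path: no locking once initialized
            with rec.mtx:
                if not rec.init:  # re-check under the lock
                    rec.prop = query_device(dev)
                    rec.init = True
        return rec.prop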
@@ -564,7 +571,7 @@ void CudaCompNodeImpl::sync() {
}

void CudaCompNodeImpl::enable_peer_access(int dev0, int dev1) {
    static bool already_enabled[StaticData::MAX_NR_DEVICE][StaticData::MAX_NR_DEVICE];
    static bool already_enabled[MAX_NR_DEVICE][MAX_NR_DEVICE];
    if (already_enabled[dev0][dev1])
        return;
@@ -817,6 +824,52 @@ CUresult call_cuda_forksafe(Func func, Val* val, Args... args) {
        return err;
    return err2;
}

template <typename Func, typename... Args>
CUresult call_cuda_forksafe(Func func, char* val, int len, Args... args) {
    auto err = func(val, len, args...);
    if (err != CUDA_ERROR_NOT_INITIALIZED)
        return err;
    // cuInit not called yet; call it in a child process
    int fd[2];
    mgb_assert(pipe(fd) == 0, "pipe() failed");
    int fdr = fd[0], fdw = fd[1];
    RAIICloseFD fdr_guard(fdr);
    RAIICloseFD fdw_guard(fdw);
    auto cpid = fork();
    mgb_assert(cpid != -1, "fork() failed");
    if (cpid == 0) {
        fdr_guard.close();
        do {
            err = cuInit(0);
            if (err != CUDA_SUCCESS)
                break;
            err = func(val, len, args...);
        } while (0);
        auto sz = write(fdw, &err, sizeof(err));
        if (sz == sizeof(err) && err == CUDA_SUCCESS) {
            sz = write(fdw, val, sizeof(*val) * len);
        }
        fdw_guard.close();
        std::quick_exit(0);
    }
    fdw_guard.close();
    auto sz = read(fdr, &err, sizeof(err));
    mgb_assert(sz == sizeof(err), "failed to read error code from child");
    if (err == CUDA_SUCCESS) {
        sz = read(fdr, val, sizeof(*val) * len);
        mgb_assert(
                static_cast<size_t>(sz) == sizeof(*val) * static_cast<size_t>(len),
                "failed to read value from child");
        return err;
    }
    // try again, maybe another thread called cuInit while we forked
    auto err2 = func(val, len, args...);
    if (err2 == CUDA_SUCCESS)
        return err2;
    if (err2 == CUDA_ERROR_NOT_INITIALIZED)
        return err;
    return err2;
}
#endif

const char* cu_get_error_string(CUresult err) {
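The new overload mirrors the existing scalar call_cuda_forksafe but ships a whole buffer back (needed for cuDeviceGetName): if the driver is uninitialized in this process, the query runs in a forked child and the result comes back over a pipe. A simplified Python analogue of that fork-and-pipe round trip (POSIX only; query is a hypothetical callable):

    import os
    import pickle

    def call_forksafe(query, *args):
        """Run query(*args) in a forked child; return the result via a pipe."""
        fdr, fdw = os.pipe()
        pid = os.fork()
        if pid == 0:  # child: run the query, ship the pickled result, exit
            os.close(fdr)
            payload = pickle.dumps(query(*args))
            os.write(fdw, len(payload).to_bytes(8, "little") + payload)
            os.close(fdw)
            os._exit(0)  # like std::quick_exit: skip cleanup handlers
        os.close(fdw)
        size = int.from_bytes(os.read(fdr, 8), "little")
        data = b""
        while len(data) < size:
            data += os.read(fdr, size - len(data))
        os.close(fdr)
        os.waitpid(pid, 0)
        return pickle.loads(data)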
@@ -914,10 +967,12 @@ CompNode::Impl* CudaCompNode::load_cuda(
    }
    if (!available_node) {
        mgb_assert(sd.nr_node < sd.MAX_NR_COMP_NODE, "too many CompNode allocated");
        mgb_assert(
                sd.nr_node < CompNodeImpl::MAX_NR_COMP_NODE,
                "too many CompNode allocated");
        available_node = &sd.node[sd.nr_node++];
    }
    mgb_assert(locator.device < sd.MAX_NR_DEVICE, "device number too large");
    mgb_assert(locator.device < CompNodeImpl::MAX_NR_DEVICE, "device number too large");

    mgb_assert(!available_node->m_initialized);
    available_node->init(locator, locator_logical);
@@ -1023,29 +1078,39 @@ void CudaCompNode::set_prealloc_config(
    }
}

size_t CudaCompNode::get_compute_capability(int dev) {
    size_t cnt = get_device_count();
    if (dev < 0 || dev >= static_cast<int>(cnt)) {
        mgb_log_error("request gpu %d out of valid range [0, %lu)", dev, cnt);
        return 0;
    }
    static Spinlock mtx_com;
    MGB_LOCK_GUARD(mtx_com);
    int pmajor;
    int pminor;
    auto err = call_cuda_forksafe(
            cuDeviceGetAttribute, &pmajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
            dev);
    if (err != CUDA_SUCCESS) {
        return 0;
    }
    auto err2 = call_cuda_forksafe(
            cuDeviceGetAttribute, &pminor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
            dev);
    if (err2 != CUDA_SUCCESS) {
        return 0;
CompNode::DeviceProperties CudaCompNode::get_device_prop(int dev) {
    int cnt = static_cast<int>(get_device_count());
    mgb_assert(
            dev >= 0 && dev < cnt, "request gpu %d out of valid range [0, %d)", dev,
            cnt);

    auto&& rec = device_prop_rec[dev];
    if (!rec.init) {
        MGB_LOCK_GUARD(rec.mtx_com);
        if (!rec.init) {
            char pname[256] = {0};
            mgb_assert(
                    call_cuda_forksafe(
                            cuDeviceGetAttribute, &rec.prop.major,
                            CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
                            dev) == CUDA_SUCCESS);
            mgb_assert(
                    call_cuda_forksafe(
                            cuDeviceGetAttribute, &rec.prop.minor,
                            CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
                            dev) == CUDA_SUCCESS);
            mgb_assert(
                    call_cuda_forksafe(cuDeviceGetName, pname, 255, dev) ==
                    CUDA_SUCCESS);
            mgb_assert(
                    call_cuda_forksafe(cuDeviceTotalMem, &rec.prop.total_memory, dev) ==
                    CUDA_SUCCESS);
            rec.prop.name = pname;
            rec.init = true;
        }
    }
    return pmajor * 10 + pminor;
    return rec.prop;
}

#else
@@ -1067,8 +1132,8 @@ void CudaCompNode::sync_all() {}

void CudaCompNode::set_prealloc_config(
        size_t alignment, size_t min_req, size_t max_overhead, double growth_factor) {}

size_t CudaCompNode::get_compute_capability(int dev) {
    return 0;
CompNode::DeviceProperties CudaCompNode::get_device_prop(int dev) {
    return CompNode::DeviceProperties{};
}

#undef err
@@ -31,7 +31,7 @@ public:
    static size_t get_device_count(bool warn = true);
    static Impl* load_cuda(const Locator& locator, const Locator& locator_logical);
    static void sync_all();

    static size_t get_compute_capability(int dev);
    static DeviceProperties get_device_prop(int dev);

    static void set_prealloc_config(
            size_t alignment, size_t min_req, size_t max_overhead,
@@ -80,6 +80,20 @@ public:
    static constexpr size_t NR_DEVICE_TYPE =
            static_cast<size_t>(DeviceType::MAX_DEVICE_ID);

    struct DeviceProperties {
        DeviceProperties() {
            name = "unspec";
            total_memory = major = minor = 0;
        }

        std::string name;
        size_t total_memory;
        //! for cuda
        int major;
        int minor;
    };

    /*!
     * \brief an identifier to specify a computing node
     *
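From Python, a default-constructed DeviceProperties shows exactly these fallback values; a quick check against the binding above (private module path, so it may move between releases):

    from megengine.core._imperative_rt.common import DeviceProperties

    prop = DeviceProperties()  # bound with py::init(), so default-constructible
    print(prop.name, prop.total_memory, prop.major, prop.minor)  # unspec 0 0 0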
@@ -301,10 +315,11 @@ public:
    MGE_WIN_DECLSPEC_FUC static void set_prealloc_config(
            size_t alignment, size_t min_req, size_t max_overhead, double growth_factor,
            DeviceType device_type);

    /*!
     * \brief get compute capability of the specified device
     * \brief get device property of the specified device
     */
    MGE_WIN_DECLSPEC_FUC static size_t get_compute_capability(
    MGE_WIN_DECLSPEC_FUC static DeviceProperties get_device_prop(
            int dev, DeviceType device_type);

    /* =================== synchronization ======================== */
@@ -268,5 +268,6 @@
#endif

#define GIT_FULL_HASH "@GIT_FULL_HASH@"
#define MGE_CUDA_GENCODE "@MGE_CUDA_GENCODE@"

#endif  // _HEADER_MGB_BUILD_CONFIG