
feat(mge): add warning message when mismatched cuda sm is detected

GitOrigin-RevId: f78c79eb06
Branch: release-1.10
Author: Megvii Engine Team (3 years ago)
Parent commit: 1c2a323e78
8 changed files with 172 additions and 43 deletions:

  1. imperative/python/megengine/__init__.py      +23   -0
  2. imperative/python/megengine/device.py         +8   -4
  3. imperative/python/src/common.cpp             +23   -1
  4. src/core/impl/comp_node/comp_node.cpp         +3   -4
  5. src/core/impl/comp_node/cuda/comp_node.cpp   +96  -31
  6. src/core/impl/comp_node/cuda/comp_node.h      +1   -1
  7. src/core/include/megbrain/comp_node.h        +17   -2
  8. src/megbrain_build_config.h.in                +1   -0
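
For orientation before the per-file diffs: after this commit, importing megengine on a CUDA build compares each visible GPU's compute capability against the SM versions the wheel was compiled for (exposed through the new get_supported_sm_versions binding, which returns the build's MGE_CUDA_GENCODE string) and logs a warning on mismatch. The snippet below is a rough, standalone way to run the same comparison by hand; it assumes a CUDA-enabled build with at least one visible GPU and is illustrative only.

import re

from megengine.core._imperative_rt.common import get_supported_sm_versions
from megengine.device import get_cuda_device_property, get_device_count

# SM versions baked into this build, parsed the same way __init__.py does it.
built_sms = re.findall(r"sm_(\d+)", get_supported_sm_versions())

for idx in range(get_device_count("gpu")):
    prop = get_cuda_device_property(idx)
    cur_sm = prop.major * 10 + prop.minor
    status = "supported" if str(cur_sm) in built_sms else "NOT supported"
    print("{}: sm_{} -> {}".format(prop.name, cur_sm, status))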

imperative/python/megengine/__init__.py (+23, -0)

@@ -8,6 +8,7 @@
 # "AS IS" BASIS, WITHOUT ARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 import atexit
 import ctypes
+import re
 import os
 import platform
 import sys
@@ -89,6 +90,9 @@ if sys.platform == "win32":
 from .core._imperative_rt.core2 import close as _close
 from .core._imperative_rt.core2 import full_sync as _full_sync
 from .core._imperative_rt.core2 import sync as _sync
+from .core._imperative_rt.common import (
+    get_supported_sm_versions as _get_supported_sm_versions,
+)
 from .core._imperative_rt.utils import _set_fork_exec_path_for_timed_func
 from .config import *
 from .device import *
@@ -99,6 +103,25 @@ from .utils import comp_graph_tools as cgtools
 from .utils.persistent_cache import PersistentCacheOnServer as _PersistentCacheOnServer
 from .version import __version__
 
+
+logger = get_logger(__name__)
+ngpus = get_device_count("gpu")
+supported_sm_versions = re.findall(r"sm_(\d+)", _get_supported_sm_versions())
+for idx in range(ngpus):
+    prop = get_cuda_device_property(idx)
+    cur_sm = str(prop.major * 10 + prop.minor)
+    if not cur_sm in supported_sm_versions:
+        logger.warning(
+            "{} with CUDA capability sm_{} is not compatible with the current MegEngine installation. The current MegEngine install supports CUDA {} {}. If you want to use the {} with MegEngine, please check the instructions at https://github.com/MegEngine/MegEngine/blob/master/scripts/cmake-build/BUILD_README.md".format(
+                prop.name,
+                cur_sm,
+                "capabilities" if len(supported_sm_versions) > 1 else "capability",
+                " ".join(["sm_" + v for v in supported_sm_versions]),
+                prop.name,
+            )
+        )
+
+
 _set_fork_exec_path_for_timed_func(
     sys.executable,
     os.path.join(os.path.dirname(__file__), "utils", "_timed_func_fork_exec_entry.py"),


imperative/python/megengine/device.py (+8, -4)

@@ -11,9 +11,7 @@ import re
 from typing import Optional
 
 from .core._imperative_rt.common import CompNode, DeviceType
-from .core._imperative_rt.common import (
-    get_cuda_compute_capability as _get_cuda_compute_capability,
-)
+from .core._imperative_rt.common import get_device_prop as _get_device_prop
 from .core._imperative_rt.common import set_prealloc_config as _set_prealloc_config
 from .core._imperative_rt.common import what_is_xpu as _what_is_xpu
 from .core._imperative_rt.utils import _try_coalesce_all_free_memory
@@ -25,6 +23,7 @@ __all__ = [
     "set_default_device",
     "get_mem_status_bytes",
     "get_cuda_compute_capability",
+    "get_cuda_device_property",
     "get_allocated_memory",
     "get_reserved_memory",
     "get_max_reserved_memory",
@@ -161,7 +160,12 @@ def get_cuda_compute_capability(device: int, device_type=DeviceType.CUDA) -> int
     Returns:
         a version number, or `SM version`.
     """
-    return _get_cuda_compute_capability(device, device_type)
+    prop = _get_device_prop(device, device_type)
+    return prop.major * 10 + prop.minor
+
+
+def get_cuda_device_property(device: int, device_type=DeviceType.CUDA):
+    return _get_device_prop(device, device_type)
 
 
 def get_allocated_memory(device: Optional[str] = None):
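
As a side note (not part of the commit itself), the refactored get_cuda_compute_capability and the new get_cuda_device_property are now backed by the same _get_device_prop call, so the SM number can be recomputed from the returned properties. A minimal sketch, assuming a CUDA build with device 0 present:

from megengine.device import get_cuda_compute_capability, get_cuda_device_property

prop = get_cuda_device_property(0)  # exposes name, total_memory, major, minor
assert get_cuda_compute_capability(0) == prop.major * 10 + prop.minor
print(prop.name, prop.total_memory)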


imperative/python/src/common.cpp (+23, -1)

@@ -123,6 +123,23 @@ void init_common(py::module m) {
 
     py::implicitly_convertible<std::string, CompNode>();
 
+    py::class_<CompNode::DeviceProperties>(m, "DeviceProperties")
+            .def(py::init())
+            .def_property_readonly(
+                    "name",
+                    [](const CompNode::DeviceProperties prop) { return prop.name; })
+            .def_property_readonly(
+                    "total_memory",
+                    [](const CompNode::DeviceProperties prop) {
+                        return prop.total_memory;
+                    })
+            .def_property_readonly(
+                    "major",
+                    [](const CompNode::DeviceProperties prop) { return prop.major; })
+            .def_property_readonly("minor", [](const CompNode::DeviceProperties prop) {
+                return prop.minor;
+            });
+
     def_TensorND<DeviceTensorND>(m, "DeviceTensorND")
             .def("numpy", [](const DeviceTensorND& self) {
                 HostTensorND hv;
@@ -223,7 +240,12 @@ void init_common(py::module m) {
     m.def("set_prealloc_config", &CompNode::set_prealloc_config,
           "specifies how to pre-allocate from raw dev allocator");
 
-    m.def("get_cuda_compute_capability", &CompNode::get_compute_capability);
+    m.def("get_device_prop", &CompNode::get_device_prop);
+
+    m.def("get_supported_sm_versions", []() {
+        static const char* mge_gen_code = MGE_CUDA_GENCODE;
+        return mge_gen_code;
+    });
 
     m.def("what_is_xpu",
           [] { return CompNode::Locator::parse("xpux").to_physical().type; });
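
For reference, the bindings added here surface in Python under megengine.core._imperative_rt.common; the sketch below pokes them directly (the wrappers in device.py are the intended entry points), assuming a CUDA build:

from megengine.core._imperative_rt.common import (
    DeviceType,
    get_device_prop,
    get_supported_sm_versions,
)

prop = get_device_prop(0, DeviceType.CUDA)  # instance of the DeviceProperties class bound above
print(prop.name, prop.major, prop.minor, prop.total_memory)
print(get_supported_sm_versions())  # the MGE_CUDA_GENCODE string compiled into the build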


src/core/impl/comp_node/comp_node.cpp (+3, -4)

@@ -431,13 +431,12 @@ void CompNode::set_prealloc_config(
     };
 }
 
-size_t CompNode::get_compute_capability(int dev, DeviceType device_type) {
+CompNode::DeviceProperties CompNode::get_device_prop(int dev, DeviceType device_type) {
     switch (device_type) {
         case DeviceType::CUDA:
-            return CudaCompNode::get_compute_capability(dev);
+            return CudaCompNode::get_device_prop(dev);
         default:
-            mgb_log_warn("unsupport device type for get_compute_capability");
-            return 0;
+            mgb_throw(MegBrainError, "unsupport device type for get_device_prop");
     };
 }




src/core/impl/comp_node/cuda/comp_node.cpp (+96, -31)

@@ -192,11 +192,11 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl {
     //! return whether global finalized, and print warning in such case
     static inline bool check_global_finalized();
 
+    static CompNode::DeviceProperties get_device_prop(int dev);
+
     //! enable peer copy from dev0 to dev1
     static void enable_peer_access(int dev0, int dev1);
 
-    static size_t get_compute_capability(int dev);
-
     static void static_free_device(ImplBase* self, void* ptr) {
         static_cast<CompNodeImpl*>(self)->free_device(ptr);
     }
@@ -208,6 +208,8 @@ class CudaCompNode::CompNodeImpl final : public CompNode::Impl {
 public:
     CompNodeImpl() : Impl(static_free_device, static_free_host) {}
 
+    static constexpr int MAX_NR_COMP_NODE = 1024, MAX_NR_DEVICE = 64;
+
     void* alloc_device(size_t size) override;
 
     void free_device(void* ptr);
@@ -332,8 +334,6 @@ struct CudaCompNodeImpl::DeviceInfo {
 };
 
 struct CudaCompNodeImpl::StaticData {
-    static constexpr int MAX_NR_COMP_NODE = 1024, MAX_NR_DEVICE = 64;
-
     std::recursive_mutex mtx;
 
     mem_alloc::DevMemAlloc::PreAllocConfig prealloc_config;
@@ -376,6 +376,13 @@ struct CudaCompNodeImpl::StaticData {
 CudaCompNodeImpl::StaticData* CudaCompNodeImpl::sd = nullptr;
 Spinlock CudaCompNodeImpl::sd_mtx;
 
+struct DevicePropRec {
+    bool init = false;
+    CompNode::DeviceProperties prop;
+    Spinlock mtx_com;
+};
+DevicePropRec device_prop_rec[CudaCompNodeImpl::MAX_NR_DEVICE];
+
 void CudaCompNodeImpl::init(const Locator& locator, const Locator& locator_logical) {
     m_locator = locator;
     m_locator_logical = locator_logical;
@@ -564,7 +571,7 @@ void CudaCompNodeImpl::sync() {
 }
 
 void CudaCompNodeImpl::enable_peer_access(int dev0, int dev1) {
-    static bool already_enabled[StaticData::MAX_NR_DEVICE][StaticData::MAX_NR_DEVICE];
+    static bool already_enabled[MAX_NR_DEVICE][MAX_NR_DEVICE];
     if (already_enabled[dev0][dev1])
         return;
 
@@ -817,6 +824,52 @@ CUresult call_cuda_forksafe(Func func, Val* val, Args... args) {
         return err;
     return err2;
 }
+template <typename Func, typename... Args>
+CUresult call_cuda_forksafe(Func func, char* val, int len, Args... args) {
+    auto err = func(val, len, args...);
+    if (err != CUDA_ERROR_NOT_INITIALIZED)
+        return err;
+    // cuInit not called, call it in child process
+    int fd[2];
+    mgb_assert(pipe(fd) == 0, "pipe() failed");
+    int fdr = fd[0], fdw = fd[1];
+    RAIICloseFD fdr_guard(fdr);
+    RAIICloseFD fdw_guard(fdw);
+    auto cpid = fork();
+    mgb_assert(cpid != -1, "fork() failed");
+    if (cpid == 0) {
+        fdr_guard.close();
+        do {
+            err = cuInit(0);
+            if (err != CUDA_SUCCESS)
+                break;
+            err = func(val, len, args...);
+        } while (0);
+        auto sz = write(fdw, &err, sizeof(err));
+        if (sz == sizeof(err) && err == CUDA_SUCCESS) {
+            sz = write(fdw, val, sizeof(*val) * len);
+        }
+        fdw_guard.close();
+        std::quick_exit(0);
+    }
+    fdw_guard.close();
+    auto sz = read(fdr, &err, sizeof(err));
+    mgb_assert(sz == sizeof(err), "failed to read error code from child");
+    if (err == CUDA_SUCCESS) {
+        sz = read(fdr, val, sizeof(*val) * len);
+        mgb_assert(
+                static_cast<size_t>(sz) == sizeof(*val) * static_cast<size_t>(len),
+                "failed to read value from child");
+        return err;
+    }
+    // try again, maybe another thread called cuInit while we fork
+    auto err2 = func(val, len, args...);
+    if (err2 == CUDA_SUCCESS)
+        return err2;
+    if (err2 == CUDA_ERROR_NOT_INITIALIZED)
+        return err;
+    return err2;
+}
 #endif
 
 const char* cu_get_error_string(CUresult err) {
@@ -914,10 +967,12 @@ CompNode::Impl* CudaCompNode::load_cuda(
     }
 
     if (!available_node) {
-        mgb_assert(sd.nr_node < sd.MAX_NR_COMP_NODE, "too many CompNode allocated");
+        mgb_assert(
+                sd.nr_node < CompNodeImpl::MAX_NR_COMP_NODE,
+                "too many CompNode allocated");
         available_node = &sd.node[sd.nr_node++];
     }
-    mgb_assert(locator.device < sd.MAX_NR_DEVICE, "device number too large");
+    mgb_assert(locator.device < CompNodeImpl::MAX_NR_DEVICE, "device number too large");
 
     mgb_assert(!available_node->m_initialized);
     available_node->init(locator, locator_logical);
@@ -1023,29 +1078,39 @@ void CudaCompNode::set_prealloc_config(
     }
 }
 
-size_t CudaCompNode::get_compute_capability(int dev) {
-    size_t cnt = get_device_count();
-    if (dev < 0 || dev >= static_cast<int>(cnt)) {
-        mgb_log_error("request gpu %d out of valid range [0, %lu)", dev, cnt);
-        return 0;
-    }
-    static Spinlock mtx_com;
-    MGB_LOCK_GUARD(mtx_com);
-    int pmajor;
-    int pminor;
-    auto err = call_cuda_forksafe(
-            cuDeviceGetAttribute, &pmajor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
-            dev);
-    if (err != CUDA_SUCCESS) {
-        return 0;
-    }
-    auto err2 = call_cuda_forksafe(
-            cuDeviceGetAttribute, &pminor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
-            dev);
-    if (err2 != CUDA_SUCCESS) {
-        return 0;
-    }
-    return pmajor * 10 + pminor;
+CompNode::DeviceProperties CudaCompNode::get_device_prop(int dev) {
+    int cnt = static_cast<int>(get_device_count());
+    mgb_assert(
+            dev >= 0 && dev < cnt, "request gpu %d out of valid range [0, %d)", dev,
+            cnt);
+
+    auto&& rec = device_prop_rec[dev];
+    if (!rec.init) {
+        MGB_LOCK_GUARD(rec.mtx_com);
+        if (!rec.init) {
+            char pname[256] = {0};
+            mgb_assert(
+                    call_cuda_forksafe(
+                            cuDeviceGetAttribute, &rec.prop.major,
+                            CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR,
+                            dev) == CUDA_SUCCESS);
+            mgb_assert(
+                    call_cuda_forksafe(
+                            cuDeviceGetAttribute, &rec.prop.minor,
+                            CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR,
+                            dev) == CUDA_SUCCESS);
+            mgb_assert(
+                    call_cuda_forksafe(cuDeviceGetName, pname, 255, dev) ==
+                    CUDA_SUCCESS);
+            mgb_assert(
+                    call_cuda_forksafe(cuDeviceTotalMem, &rec.prop.total_memory, dev) ==
+                    CUDA_SUCCESS);
+            rec.prop.name = pname;
+            rec.init = true;
+        }
+    }
+
+    return rec.prop;
 }
 
 #else
@@ -1067,8 +1132,8 @@ void CudaCompNode::sync_all() {}
 void CudaCompNode::set_prealloc_config(
         size_t alignment, size_t min_req, size_t max_overhead, double growth_factor) {}
 
-size_t CudaCompNode::get_compute_capability(int dev) {
-    return 0;
+CompNode::DeviceProperties CudaCompNode::get_device_prop(int dev) {
+    return CompNode::DeviceProperties{};
 }
 
 #undef err
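
The DevicePropRec table introduced above caches each device's properties with a double-checked pattern: test the init flag, take the per-device spinlock, test again, then run the fork-safe CUDA queries exactly once. A rough Python analogue of that caching shape, purely illustrative and with hypothetical names:

import threading

class _PropRec:
    def __init__(self):
        self.init = False
        self.prop = None
        self.lock = threading.Lock()

_prop_recs = {}

def cached_device_prop(dev, query_fn):
    rec = _prop_recs.setdefault(dev, _PropRec())
    if not rec.init:  # fast path: no lock once the record is filled
        with rec.lock:
            if not rec.init:  # re-check under the lock
                rec.prop = query_fn(dev)  # expensive device query runs only once
                rec.init = True
    return rec.prop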


src/core/impl/comp_node/cuda/comp_node.h (+1, -1)

@@ -31,7 +31,7 @@ public:
     static size_t get_device_count(bool warn = true);
     static Impl* load_cuda(const Locator& locator, const Locator& locator_logical);
     static void sync_all();
-    static size_t get_compute_capability(int dev);
+    static DeviceProperties get_device_prop(int dev);
 
     static void set_prealloc_config(
             size_t alignment, size_t min_req, size_t max_overhead,


src/core/include/megbrain/comp_node.h (+17, -2)

@@ -80,6 +80,20 @@ public:
     static constexpr size_t NR_DEVICE_TYPE =
             static_cast<size_t>(DeviceType::MAX_DEVICE_ID);
 
+    struct DeviceProperties {
+        DeviceProperties() {
+            name = "unspec";
+            total_memory = major = minor = 0;
+        }
+
+        std::string name;
+        size_t total_memory;
+
+        //! for cuda
+        int major;
+        int minor;
+    };
+
     /*!
      * \brief an identifier to specify a computing node
      *
@@ -301,10 +315,11 @@
     MGE_WIN_DECLSPEC_FUC static void set_prealloc_config(
             size_t alignment, size_t min_req, size_t max_overhead, double growth_factor,
             DeviceType device_type);
+
     /*!
-     * \brief get compute capability of the specified device
+     * \brief get device property of the specified device
      */
-    MGE_WIN_DECLSPEC_FUC static size_t get_compute_capability(
+    MGE_WIN_DECLSPEC_FUC static DeviceProperties get_device_prop(
             int dev, DeviceType device_type);
 
     /* =================== synchronization ======================== */


src/megbrain_build_config.h.in (+1, -0)

@@ -268,5 +268,6 @@
 #endif
 
 #define GIT_FULL_HASH "@GIT_FULL_HASH@"
+#define MGE_CUDA_GENCODE "@MGE_CUDA_GENCODE@"
 
 #endif  // _HEADER_MGB_BUILD_CONFIG
