You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

debug.cpp 14 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454
  1. /**
  2. * \file src/core/impl/utils/debug.cpp
  3. * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
  4. *
  5. * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
  6. *
  7. * Unless required by applicable law or agreed to in writing,
  8. * software distributed under the License is distributed on an
  9. * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
  10. * implied.
  11. */
  #include "megbrain/utils/debug.h"
  #include <cerrno>
  #include <cmath>
  #include <cstring>
  #include "megdnn/tensor_iter.h"
  16. using namespace mgb;
  17. using namespace debug;
  18. #if MGB_ENABLE_DEBUG_UTIL
  19. #include "megbrain/common.h"
  20. #include "megbrain/utils/metahelper.h"
  21. #include <cctype>
  22. #include <cstdio>
  23. #include <cstdlib>
  24. #include <cstring>
  25. #include <regex>
  26. #include "megbrain/utils/thin/function.h"
  27. #if MGB_CUDA
  28. #include <cuda.h>
  29. #include <cuda_runtime.h>
  30. #endif
  31. #include <pthread.h>
  32. #include <signal.h>
  33. #include <sys/types.h>
  34. #include <unistd.h>
  35. #ifdef __ANDROID__
  36. #include <unwind.h>
  37. #else
  38. #include <execinfo.h>
  39. #endif
  40. #ifdef __ANDROID__
  namespace {

  //! Write cursor over the caller-provided frame buffer used while unwinding.
  struct AndroidBacktraceState {
      void** current;  //!< next slot to be written
      void** end;      //!< one past the last writable slot
  };

  /*!
   * \brief per-frame callback handed to _Unwind_Backtrace
   *
   * Stores the instruction pointer of the visited frame into the buffer
   * described by \p arg. Frames whose instruction pointer is null are skipped,
   * and unwinding is stopped as soon as the buffer is full.
   *
   * \param context unwinder context of the frame being visited
   * \param arg pointer to the AndroidBacktraceState being filled
   */
  static _Unwind_Reason_Code android_unwind_callback(
          struct _Unwind_Context* context, void* arg) {
      auto* cursor = static_cast<AndroidBacktraceState*>(arg);
      void* const pc = reinterpret_cast<void*>(_Unwind_GetIP(context));
      if (!pc) {
          return _URC_NO_REASON;
      }
      if (cursor->current == cursor->end) {
          return _URC_END_OF_STACK;
      }
      *cursor->current = pc;
      ++cursor->current;
      return _URC_NO_REASON;
  }

  /*!
   * \brief collect up to \p max frame addresses into \p buffer
   * \return number of addresses actually written
   */
  size_t backtrace(void** buffer, size_t max) {
      AndroidBacktraceState cursor = {buffer, buffer + max};
      _Unwind_Backtrace(android_unwind_callback, &cursor);
      return cursor.current - buffer;
  }

  }  // anonymous namespace
  65. #endif // backtrace impl for __ANDROID__
  66. namespace {
  /*!
   * \brief raise ForkAfterCudaError through mgb_throw
   *
   * This function is the initial value assigned to ForkAfterCudaError::throw_
   * further down in this file.
   */
  void throw_fork_cuda_exc() {
  mgb_throw(ForkAfterCudaError, "fork after cuda has been initialized");
  }
  //! An address range (low, high) together with the file name recorded for it.
  struct MemmapEntry {
      uintptr_t low, high;
      std::string file;

      //! \p path must not be null because it is used to construct a std::string
      MemmapEntry(uint64_t range_begin, uint64_t range_end, const char* path)
              : low(static_cast<uintptr_t>(range_begin)),
                high(static_cast<uintptr_t>(range_end)),
                file(path) {}
  };
  76. void get_mem_map(
  77. int pid,
  78. thin_function<void(uintptr_t, uintptr_t, const char*, const char*)>
  79. callback) {
  80. char fpath[64];
  81. if (pid)
  82. sprintf(fpath, "/proc/%d/maps", pid);
  83. else
  84. strcpy(fpath, "/proc/self/maps");
  85. FILE* fin = fopen(fpath, "r");
  86. mgb_assert(fin, "failed to open %s", fpath);
  87. char linebuf[512];
  88. while (fgets(linebuf, sizeof(linebuf), fin)) {
  89. uintptr_t begin, end;
  90. char perm[10], offset[20], dev[10], inode[20], path_mem[256], *path;
  91. int nr = sscanf(linebuf, "%zx-%zx %s %s %s %s %s", &begin, &end, perm,
  92. offset, dev, inode, path_mem);
  93. if (nr == 6)
  94. path = nullptr;
  95. else {
  96. mgb_assert(nr == 7, "failed to parse map line: %s", linebuf);
  97. path = path_mem;
  98. }
  99. callback(begin, end, perm, path);
  100. }
  101. fclose(fin);
  102. }
  103. class SigHandlerInit {
  104. static void death_handler(int signum) {
  105. char msg0[] =
  106. "megbrain is about to die abruptly; you can set "
  107. "MGB_WAIT_TERMINATE and rerun to wait for gdb attach";
  108. if (MGB_GETENV("MGB_WAIT_TERMINATE")) {
  109. fprintf(stderr,
  110. "megbrain is about to die abruptly; you can gdb "
  111. "me at %d; wait for pressing enter\n",
  112. static_cast<int>(getpid()));
  113. getchar();
  114. }
  115. if (signum == -1) {
  116. mgb_log_error("%s: std::terminate() called", msg0);
  117. } else {
  118. mgb_log_error("%s: caught deadly signal %d(%s)", msg0, signum,
  119. strsignal(signum));
  120. }
  121. //FIXME: imp backtrace for macos
  122. #ifndef __APPLE__
  123. std::string bp;
  124. debug::backtrace(2).fmt_to_str(bp);
  125. mgb_log_error("%s", bp.c_str());
  126. #endif
  127. exit(EXIT_FAILURE);
  128. }
  129. public:
  130. static void init_for_segv() {
  131. struct sigaction action;
  132. memset(&action, 0, sizeof(action));
  133. action.sa_handler = &death_handler;
  134. sigaction(SIGSEGV, &action, nullptr);
  135. std::set_terminate([]() { death_handler(-1); });
  136. }
  137. };
  138. #if MGB_CUDA
  139. class CudaCheckOnFork {
  140. static int& flag() {
  141. static int ret = MGB_GETENV("MGB_THROW_ON_FORK") ? 2 : 1;
  142. return ret;
  143. }
  144. static void atfork_prepare() {
  145. if (flag() && !ScopedForkWarningSupress::supress()) {
  146. CUcontext ctx;
  147. if (cuCtxGetCurrent(&ctx) != CUDA_ERROR_NOT_INITIALIZED) {
  148. mgb_log_debug(
  149. "It is dangerous to call fork() after cuda "
  150. "context has been initialized; please ensure no cuda "
  151. "methods is invoked in the child process. You can set "
  152. "MGB_THROW_ON_FORK to find out where the fork() is "
  153. "called.");
  154. if (flag() > 1) {
  155. ForkAfterCudaError::throw_();
  156. }
  157. }
  158. }
  159. }
  160. public:
  161. static void set_flag(int f) { flag() = f; }
  162. static void init() {
  163. int err = pthread_atfork(&CudaCheckOnFork::atfork_prepare, nullptr,
  164. nullptr);
  165. if (err) {
  166. mgb_throw(SystemError, "failed to setup atfork handler: %s",
  167. strerror(err));
  168. }
  169. }
  170. };
  171. #endif
  172. class InitCaller {
  173. static InitCaller inst;
  174. InitCaller() {
  175. SigHandlerInit::init_for_segv();
  176. #if MGB_CUDA
  177. CudaCheckOnFork::init();
  178. #endif
  179. }
  180. };
  181. InitCaller InitCaller::inst;
  182. } // anonymous namespace
  // Initial value of the ForkAfterCudaError::throw_ function pointer.
  void (*ForkAfterCudaError::throw_)() = throw_fork_cuda_exc;
  // Definition of ScopedForkWarningSupress::sm_depth, starting at zero.
  std::atomic_size_t ScopedForkWarningSupress::sm_depth{0};
  185. BacktraceResult mgb::debug::backtrace(int nr_exclude) {
  186. static bool thread_local recursive_call = false;
  187. if (recursive_call) {
  188. fprintf(stderr, "recursive call to backtrace()!\n");
  189. return {};
  190. }
  191. recursive_call = true;
  192. constexpr size_t MAX_DEPTH = 6;
  193. void* stack_mem[MAX_DEPTH];
  194. int depth = ::backtrace(stack_mem, MAX_DEPTH);
  195. auto stack = stack_mem;
  196. if (depth > nr_exclude) {
  197. depth -= nr_exclude;
  198. stack += nr_exclude;
  199. }
  200. static std::vector<MemmapEntry> memmap;
  201. if (memmap.empty()) {
  202. static std::mutex mtx;
  203. MGB_LOCK_GUARD(mtx);
  204. if (memmap.empty()) {
  205. get_mem_map(0, [&](uintptr_t lo, uintptr_t hi, const char* /*perm*/,
  206. const char* fname) {
  207. if (fname && strlen(fname))
  208. memmap.emplace_back(lo, hi, fname);
  209. });
  210. }
  211. }
  212. BacktraceResult result;
  213. for (int i = 0; i < depth; ++i) {
  214. const char* fname = nullptr;
  215. auto addr = reinterpret_cast<uintptr_t>(stack[i]);
  216. for (auto&& j : memmap)
  217. if (j.low <= addr && j.high >= addr) {
  218. // theoretically we should examine file content to find whether
  219. // it is a shared library; but who would name an executable with
  220. // .so ?
  221. if (j.file.find(".so") != std::string::npos)
  222. addr -= j.low;
  223. fname = j.file.c_str();
  224. break;
  225. }
  226. result.stack.emplace_back(fname, addr);
  227. }
  228. recursive_call = false;
  229. return result;
  230. }
  231. void BacktraceResult::fmt_to_str(std::string& dst) {
  232. char addr[128];
  233. bool first = true;
  234. const char* prev_fname = nullptr;
  235. dst.append("bt:");
  236. for (auto&& i : stack) {
  237. sprintf(addr, "%zx", i.second);
  238. if (i.first != prev_fname || first) {
  239. if (!first)
  240. dst.append("}");
  241. if (i.first)
  242. dst.append(i.first);
  243. else
  244. dst.append("unknown");
  245. prev_fname = i.first;
  246. first = false;
  247. dst.append("{");
  248. dst.append(addr);
  249. } else {
  250. dst.append(",");
  251. dst.append(addr);
  252. }
  253. }
  254. dst.append("}");
  255. }
  256. void debug::set_fork_cuda_warning_flag(int flag) {
  257. #if MGB_CUDA
  258. CudaCheckOnFork::set_flag(flag);
  259. #endif
  260. }
  261. #endif // MGB_ENABLE_DEBUG_UTIL
  262. namespace {
  //! true iff \p val is neither NaN nor infinite
  bool good_float(float val) {
      return !(std::isnan(val) || std::isinf(val));
  }
  //! overload for int arguments: always true
  bool good_float(int) {
      return true;
  }
  269. #if MGB_ENABLE_LOGGING
  270. // if not in MGB_ENABLE_LOGGING, num2str would become defined but not used
  271. template <typename T>
  272. std::string num2str(T val) {
  273. return std::to_string(val);
  274. }
  275. std::string num2str(float val) {
  276. union V {
  277. uint32_t i;
  278. float f;
  279. };
  280. auto ret = std::to_string(val);
  281. if (!good_float(val)) {
  282. V v;
  283. v.f = val;
  284. ret.append(" (0x");
  285. ret.append(ssprintf("%x", v.i));
  286. ret.append(")");
  287. }
  288. return ret;
  289. }
  290. #endif
  291. template <typename ctype>
  292. Maybe<std::string> do_compare_tensor_value(const char* expr0, const char* expr1,
  293. const HostTensorND& v0,
  294. const HostTensorND& v1,
  295. float maxerr) {
  296. auto it0 = megdnn::tensor_iter<ctype>(v0.as_megdnn()).begin(),
  297. it1 = megdnn::tensor_iter<ctype>(v1.as_megdnn()).begin();
  298. for (size_t i = 0, it = v0.shape().total_nr_elems(); i < it; ++i) {
  299. ctype iv0 = *it0, iv1 = *it1;
  300. double err = std::abs(iv0 - iv1) /
  301. std::max<double>(
  302. 1, std::min(std::abs(static_cast<double>(iv0)),
  303. std::abs((static_cast<double>(iv1)))));
  304. if (!good_float(iv0) || !good_float(iv1) || err >= maxerr) {
  305. TensorShape idx_shp;
  306. idx_shp.ndim = v0.shape().ndim;
  307. std::copy(it0.idx(), it0.idx() + idx_shp.ndim, idx_shp.shape);
  308. return mgb_ssprintf_log(
  309. "Unequal value\n"
  310. "Value of: %s\n"
  311. " Actual: %s\n"
  312. "Expected: %s\n"
  313. "Which is: %s\n"
  314. "At index: %s/%s\n"
  315. " error: %.6g",
  316. expr1, num2str(iv1).c_str(), expr0, num2str(iv0).c_str(),
  317. idx_shp.to_string().c_str(), v0.shape().to_string().c_str(),
  318. err);
  319. }
  320. ++it0;
  321. ++it1;
  322. }
  323. return None;
  324. }
  325. } // anonymous namespace
  326. Maybe<std::string> debug::compare_tensor_value(const HostTensorND& v0,
  327. const char* expr0,
  328. const HostTensorND& v1,
  329. const char* expr1,
  330. float maxerr) {
  331. if (!v0.shape().eq_shape(v1.shape())) {
  332. return mgb_ssprintf_log(
  333. "Shape mismatch\n"
  334. "Value of: %s\n"
  335. " Actual: %s\n"
  336. "Expected: %s\n"
  337. "Which is: %s",
  338. expr1, v1.shape().to_string().c_str(), expr0,
  339. v0.shape().to_string().c_str());
  340. }
  341. auto dtype = v0.layout().dtype;
  342. if (dtype != v1.layout().dtype) {
  343. return mgb_ssprintf_log(
  344. "Data type mismatch\n"
  345. "Value of: %s\n"
  346. " Actual: %s\n"
  347. "Expected: %s\n"
  348. "Which is: %s",
  349. expr1, v1.layout().dtype.name(), expr0,
  350. v0.layout().dtype.name());
  351. }
  352. switch (dtype.enumv()) {
  353. #define cb(_dt) \
  354. case DTypeTrait<_dt>::enumv: \
  355. return do_compare_tensor_value<DTypeTrait<_dt>::ctype>( \
  356. expr0, expr1, v0, v1, maxerr);
  357. MEGDNN_FOREACH_COMPUTING_DTYPE(cb)
  358. #undef cb
  359. default:
  360. mgb_throw(MegBrainError, "unhandled dtype: %s", dtype.name());
  361. }
  362. }
  363. std::string debug::dump_tensor(const HostTensorND& value,
  364. const std::string& name) {
  365. struct Header {
  366. uint32_t name_len;
  367. uint32_t dtype;
  368. uint32_t max_ndim;
  369. uint32_t shape[TensorShape::MAX_NDIM];
  370. char name[0];
  371. };
  372. mgb_assert(value.layout().is_contiguous());
  373. auto value_bytes = value.layout().span().dist_byte();
  374. std::string ret(name.size() + value_bytes + sizeof(Header), '\0');
  375. auto header = reinterpret_cast<Header*>(&ret[0]);
  376. memset(header, 0, sizeof(Header));
  377. header->name_len = name.length();
  378. header->dtype = static_cast<uint32_t>(value.dtype().enumv());
  379. header->max_ndim = TensorShape::MAX_NDIM;
  380. for (size_t i = 0; i < value.layout().ndim; ++i) {
  381. header->shape[i] = value.layout()[i];
  382. }
  383. memcpy(header->name, name.c_str(), header->name_len);
  384. memcpy(header->name + name.size(), value.raw_ptr(), value_bytes);
  385. return ret;
  386. }
  387. void debug::write_to_file(const char* filename, const std::string& content,
  388. const char* mode) {
  389. FILE* fout = fopen(filename, mode);
  390. mgb_throw_if(!fout, SystemError, "failed to open %s: %s", filename,
  391. strerror(errno));
  392. auto nr = fwrite(content.data(), 1, content.size(), fout);
  393. mgb_throw_if(nr != content.size(), SystemError,
  394. "failed to write to %s: num=%zu size=%zu %s", filename, nr,
  395. content.size(), strerror(errno));
  396. auto err = fclose(fout);
  397. mgb_throw_if(err, SystemError, "failed to close %s: %s", filename,
  398. strerror(errno));
  399. }
  400. // vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

MegEngine 安装包中集成了使用 GPU 运行代码所需的 CUDA 环境,不用区分 CPU 和 GPU 版。 如果想要运行 GPU 程序,请确保机器本身配有 GPU 硬件设备并安装好驱动。 如果你想体验在云端 GPU 算力平台进行深度学习开发的感觉,欢迎访问 MegStudio 平台