
megbrain_wrap.cpp 32 kB

/**
 * \file python_module/src/cpp/megbrain_wrap.cpp
 *
 * This file is part of MegBrain, a deep learning framework developed by Megvii.
 *
 * \copyright Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 */

#include "./megbrain_wrap.h"
#include "./python_helper.h"
#include "./megbrain_pubapi_internal.h"

#include "megbrain/version.h"
#include "megbrain/tensor.h"
#include "megbrain/comp_node_env.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/utility.h"
#include "megbrain/gopt/inference.h"
#include "megbrain/utils/thread.h"
#include "megbrain/utils/timer.h"

#include <cstring>

using namespace mgb;

namespace {

bool g_global_finalize_called = false;

/*!
 * \brief record the vars produced from user-created Host2DeviceCopy
 *
 * Note that the vars are mapped by address of underlying HostTensorND, so
 * in the case of partial execution, vars in the parent graph can be
 * retrieved from oprs in the sub graphs.
 */
class UserInputVars final : public UserDataContainer::UserData {
    MGB_TYPEINFO_OBJ_DECL;

    //! we keep this mapping to handle multi-part compiling, where new
    //! graphs would be created and the var in the original graph is needed
    ThinHashMap<HostTensorND*, VarNode*> m_tensor2var;

public:
    void register_var(SymbolVar x) {
        m_tensor2var[x.node()->owner_opr()
                             ->cast_final_safe<opr::Host2DeviceCopy>()
                             .host_data()
                             .get()] = x.node();
    }

    //! get the corresponding var from an opr if it has been registered;
    //! return nullptr otherwise
    VarNode* check(cg::OperatorNodeBase* opr) const {
        if (opr->same_type<opr::Host2DeviceCopy>()) {
            auto ptr = opr->cast_final<opr::Host2DeviceCopy>()
                               .host_data()
                               .get();
            auto iter = m_tensor2var.find(ptr);
            return iter == m_tensor2var.end() ? nullptr : iter->second;
        }
        return nullptr;
    }

    static UserInputVars& get(ComputingGraph* graph) {
        return *graph->options()
                        .user_data.get_user_data_or_create<UserInputVars>();
    }
};

__attribute__((constructor))
void global_init() {
    CompNode::enable_affinity_for_cpu(true);
}

} // anonymous namespace

MGB_TYPEINFO_OBJ_IMPL(UserInputVars);

/* ================= SharedND ================= */
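// SharedND wraps either an owned DeviceTensorND (m_dev_tensor) or a graph
// VarNode (m_var); the methods below assert that exactly one of the two is
// set. sync() blocks on a copy only when m_copy_sync is enabled and reports
// whether it did so, letting callers decide if the host source buffer must
// be kept alive for an asynchronous copy.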
bool SharedND::sync(mgb::DeviceTensorND &dv) {
    if (m_copy_sync) {
        dv.sync();
        return true;
    }
    return false;
}

void SharedND::_set_init_shape(const std::vector<size_t> &shape) {
    mgb_assert(m_dev_tensor && m_dev_tensor->empty());
    m_dev_tensor->resize(npy::vec2shape(shape));
}

void SharedND::_resize(const std::vector<size_t> &shape) {
    auto tshp = npy::vec2shape(shape);
    if (m_dev_tensor) {
        m_dev_tensor->resize(tshp);
    } else {
        mgb_assert(m_var);
        m_var->shape_alloc(tshp);
    }
}

void SharedND::_reset_zero() {
    fill_zero_dev_tensor(*m_dev_tensor);
}

void SharedND::_copy_from_npyarr(PyObject *npyarr) {
    auto do_copy = [&](DeviceTensorND *dest, VarNode *var) {
        DType dtype = dest ? dest->dtype() : var->dtype();
        mgb_assert(dtype.valid());
        auto hv = npy::np2tensor(npyarr, npy::Meth::borrow(), dtype);
        if (var) {
            // only setup by assign(), by craniotome
            var->shape_alloc(hv.shape());
            dest = &var->mutable_dev_tensor();
        }
        if (!sync(dest->copy_from(hv))) {
            m_async_copy_refkeeper = hv;
        } else {
            m_async_copy_refkeeper = {};
        }
    };

    if (m_var) {
        mgb_assert(!m_dev_tensor);
        do_copy(nullptr, m_var);
    } else {
        mgb_assert(m_dev_tensor);
        do_copy(m_dev_tensor.get(), nullptr);
    }
}

PyObject* SharedND::_get_npyarr() {
    mgb_assert(m_dev_tensor);
    if (m_dev_tensor->empty())
        Py_RETURN_NONE;

    HostTensorND hv;
    hv.comp_node(CompNode::default_cpu())
        .copy_from(*m_dev_tensor)
        .sync();
    return npy::ndarray_from_tensor(hv, npy::ShareType::TRY_SHARE);
}

PyObject* SharedND::_get_dtype() {
    mgb_assert(m_dev_tensor);
    return npy::dtype_mgb2np(m_dev_tensor->dtype());
}

void SharedND::_copy_from_value_proxy(CompGraphCallbackValueProxy &value) {
    if (value.eager_copy()) {
        mgb_log_warn("copy from eager-copied CompGraphCallbackValueProxy into"
                     " SharedND; consider using callback_lazycopy; traceback:\n%s",
                     PyStackExtracter::run().c_str());
    }
    if (m_var) {
        mgb_assert(!m_dev_tensor);
        auto &&src = value.dev_tensor();
        m_var->shape_alloc(src.shape()).
            mutable_dev_tensor().copy_from(src);
    } else {
        mgb_assert(m_dev_tensor);
        sync(m_dev_tensor->copy_from(value.dev_tensor()));
    }
}

void SharedND::_share_from_value_proxy(CompGraphCallbackValueProxy& value) {
    if (value.eager_copy()) {
        mgb_log_warn(
                "share value from eager-copied CompGraphCallbackValueProxy into"
                " SharedND; consider using callback_lazycopy; traceback:\n%s",
                PyStackExtracter::run().c_str());
    }
    if (m_var) {
        mgb_assert(!m_dev_tensor);
        m_var->reset_dev_tensor_from_tensor(value.dev_tensor());
    } else {
        mgb_assert(m_dev_tensor);
        *m_dev_tensor = value.dev_tensor();
    }
}

SharedND SharedND::_from_symvar(SymbolVar symvar) {
    auto opr = symvar.node()->owner_opr();
    if (auto vsnd = opr->try_cast_final<opr::VolatileSharedDeviceTensor>()) {
        return SharedND(vsnd->dev_data());
    }
    if (auto snd = opr->try_cast_final<opr::SharedDeviceTensor>()) {
        return SharedND(snd->dev_data());
    }
    mgb_throw(MegBrainError, "cannot convert from %s", opr->dyn_typeinfo()->name);
}
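// Expose the underlying device buffer through the public C API: version 0
// returns the raw device pointer directly, while any other version fills
// m_pubapi_dev_tensor and returns a pointer to that descriptor struct.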
uintptr_t SharedND::_pubapi_dev_tensor_ptr(int version) {
    DeviceTensorND *dv;
    if (m_dev_tensor) {
        mgb_assert(!m_var);
        dv = m_dev_tensor.get();
    } else {
        mgb_assert(m_var);
        dv = nullptr;
    }
    void *ret;
    if (version == 0) {
        if (dv) {
            ret = dv->raw_ptr();
        } else {
            ret = m_var->dev_tensor().raw_ptr();
        }
    } else {
        init_pubapi_dev_tensor(m_pubapi_dev_tensor, dv, m_var, false);
        ret = &m_pubapi_dev_tensor;
    }
    return reinterpret_cast<uintptr_t>(ret);
}

SymbolVar SharedND::_as_sym_var(CompGraph &cg, const std::string &name,
                                bool volatile_) {
    mgb_assert(m_dev_tensor);
    OperatorNodeConfig config;
    if (!name.empty())
        config.name(name);
    if (volatile_) {
        return opr::VolatileSharedDeviceTensor::make(cg.get(), m_dev_tensor,
                                                     config);
    } else {
        return opr::SharedDeviceTensor::make(cg.get(), m_dev_tensor, config);
    }
}

std::vector<size_t> SharedND::_get_shape() {
    if (m_var) {
        mgb_assert(!m_dev_tensor);
        return npy::shape2vec(m_var->shape());
    }
    mgb_assert(m_dev_tensor);
    return npy::shape2vec(m_dev_tensor->shape());
}

void SharedND::copy_to_sub_from_shared(
        int axis, ptrdiff_t begin, ptrdiff_t end, ptrdiff_t step,
        const SharedND &rhs) {
    mgb_assert(m_dev_tensor && rhs.m_dev_tensor);
    auto sub = m_dev_tensor->sub(
            Slice(begin, end, step).apply(m_dev_tensor->layout(), axis));
    sub.copy_from_fixlayout(*rhs.m_dev_tensor).sync();
}
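// axis carries sentinel values here: -3 copies the whole tensor keeping the
// destination layout fixed, -2 copies the whole tensor allowing relayout,
// and any other value copies from the given slice of rhs along that axis.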
void SharedND::copy_from_shared_sub(const SharedND &rhs,
        int axis, ptrdiff_t begin, ptrdiff_t end, ptrdiff_t step) {
    mgb_assert(m_dev_tensor && rhs.m_dev_tensor);
    if (axis == -3) {
        sync(m_dev_tensor->copy_from_fixlayout(*rhs.m_dev_tensor));
    } else if (axis == -2) {
        sync(m_dev_tensor->copy_from(*rhs.m_dev_tensor));
    } else {
        auto sub = rhs.m_dev_tensor->sub(
                Slice(begin, end, step).apply(
                    rhs.m_dev_tensor->layout(), axis));
        sync(m_dev_tensor->copy_from(sub));
    }
}

void SharedND::_check_before_share_memory(const SharedND& rhs) {
    mgb_assert(rhs.m_dev_tensor);
    mgb_assert(m_dev_tensor);
    mgb_assert(rhs.m_dev_tensor->dtype() == m_dev_tensor->dtype());
    mgb_assert(rhs.m_dev_tensor->comp_node() == m_dev_tensor->comp_node());
}

void SharedND::_share_memory_from(const SharedND& rhs, size_t begin) {
    _check_before_share_memory(rhs);
    m_dev_tensor->reset(
            rhs.m_dev_tensor->storage().sub(m_dev_tensor->dtype().size() * begin),
            m_dev_tensor->layout());
}

void SharedND::_reset_dev_tensor(const SharedND &rhs) {
    _check_before_share_memory(rhs);
    *m_dev_tensor = *(rhs.m_dev_tensor);
}

/* ================= _HostSharedND ================= */

void _HostSharedND::ensure_own_storage() {
    if (!m_own_storage) {
        mgb_assert(m_tensor);
        HostTensorND val{m_tensor->comp_node(), m_tensor->dtype()};
        if (!m_tensor->empty()) {
            val.resize(m_tensor->shape());
        }
        *m_tensor = std::move(val);
        m_own_storage = true;
    }
}

void _HostSharedND::_resize(const std::vector<size_t> &shape) {
    ensure_own_storage();
    m_tensor->resize(npy::vec2shape(shape));
}

void _HostSharedND::_copy_from_npyarr(PyObject *npyarr, bool borrow) {
    mgb_assert(m_tensor);
    mgb_assert(m_tensor->dtype().valid());
    if (!m_borrow_on_cpu &&
        m_tensor->comp_node().device_type() == CompNode::DeviceType::CPU) {
        borrow = false;
    }
    if (borrow) {
        auto val = npy::np2tensor(
                npyarr, npy::Meth::borrow(m_tensor->comp_node()),
                m_tensor->dtype());
        m_own_storage = false;
        *m_tensor = std::move(val);
    } else {
        ensure_own_storage();
        npy::np2tensor(npyarr,
                npy::Meth::copy_into(m_tensor.get()), m_tensor->dtype());
    }
}

SymbolVar _HostSharedND::_as_sym_var(CompGraph &cg, bool enable_static_infer,
                                     const std::string &name) {
    if (m_tensor->empty())
        cg.get().options().allocate_static_mem_after_graph_compile = false;

    OperatorNodeConfig config;
    if (!name.empty())
        config.name(name);

    SymbolVar ret;
    if (enable_static_infer) {
        ret = opr::Host2DeviceCopy::make(cg.get(), m_tensor, config);
    } else {
        ret = opr::Host2DeviceCopy::make_no_value_infer(cg.get(), m_tensor,
                                                        config);
    }
    UserInputVars::get(&cg.get()).register_var(ret);
    return ret;
}

_HostSharedND _HostSharedND::make_proxy(SymbolVar var) {
    auto &&opr = var.node()->owner_opr()->
        cast_final_safe<opr::Host2DeviceCopy>();
    _HostSharedND rst{var.node()->comp_node(), var.dtype()};
    rst.m_tensor = opr.host_data();
    rst.m_proxied_opr = &opr;
    return rst;
}

std::string _HostSharedND::__repr__() const {
    if (m_proxied_opr) {
        return ssprintf("<HostSharedND proxy at %p for %s>",
                        this, m_proxied_opr->cname());
    }
    return ssprintf("<HostSharedND at %p>", this);
}

PyObject* _HostSharedND::_get_dtype() {
    mgb_assert(m_tensor);
    return npy::dtype_mgb2np(m_tensor->dtype());
}

/* ================= CompGraphCallbackValueProxy ================= */

CompGraphCallbackValueProxy
CompGraphCallbackValueProxy::make_raw_host_value_proxy(
        const mgb::HostTensorND &hv) {
    CompGraphCallbackValueProxy ret;
    ret.m_use_raw_hv = true;
    ret.m_hv = hv;
    ret.m_is_active = true;
    return ret;
}
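// setup() is invoked for every graph execution: it spins until the previous
// callback on this proxy has been consumed, then either starts an eager
// device-to-host copy or just keeps a reference to the device tensor for
// lazy access from Python.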
void CompGraphCallbackValueProxy::setup(
        const mgb::DeviceTensorND &val, bool eager_copy) {
    while (__atomic_load_n(&m_is_active, __ATOMIC_SEQ_CST)) {
        // wait for previous callback to finish
        std::this_thread::yield();
    }
    mgb_assert(!m_use_raw_hv && val.shape_valid());
    m_eager_copy = eager_copy;
    m_dev_value = val;
    if (eager_copy) {
        m_value_used = false;
        do_copy();
    } else {
        m_value_used = true;
    }
    __atomic_store_n(&m_is_active, true, __ATOMIC_SEQ_CST);
}

void CompGraphCallbackValueProxy::do_copy() {
    mgb_assert(!m_use_raw_hv && m_dev_value.shape_valid());
    m_hv.copy_from(m_dev_value);

    auto cn = m_hv.comp_node();
    if (!m_copy_event)
        m_copy_event = cn.create_event();
    m_copy_event->record();
}

#if defined(WIN32)
#include <windows.h>
#include <stdio.h>
#undef CONST
#define usleep Sleep
#endif

void CompGraphCallbackValueProxy::sync() const {
    mgb_assert(!m_use_raw_hv);
    RealTimer t0;
    double next_warn_time = 2, warn_time_delta = 1;
    while (!m_copy_event->finished()) {
        //! sleep 1ms or sleep 1us no difference for performance on win32
        usleep(1);
        if (t0.get_secs() >= next_warn_time) {
            mgb_log_warn("wait d2h copy for more than %.3f secs",
                         t0.get_secs());
            next_warn_time += warn_time_delta;
            warn_time_delta += 1;
        }
    }
}

void CompGraphCallbackValueProxy::on_finished() {
    mgb_assert(m_is_active && !m_use_raw_hv);
    m_dev_value = {};
    if (m_hv.shape_valid()) {
        m_hv.resize({}); // resize to reuse buffer
    }
    __atomic_store_n(&m_is_active, false, __ATOMIC_SEQ_CST);
    if (!m_value_used) {
        mgb_log_warn("computing graph callback did not read the value");
    }
}

PyObject* CompGraphCallbackValueProxy::_get_npyarr() {
    mgb_assert(m_is_active);
    if (!m_use_raw_hv) {
        mgb_assert(m_dev_value.shape_valid());
        if (!m_hv.shape_valid()) {
            do_copy();
            sync();
        }
    }
    m_value_used = true;
    return npy::ndarray_from_tensor(m_hv, npy::ShareType::TRY_SHARE);
}

PyObject* CompGraphCallbackValueProxy::_get_dtype() {
    mgb_assert(m_is_active);
    if (m_use_raw_hv)
        return npy::dtype_mgb2np(m_hv.dtype());
    mgb_assert(m_dev_value.shape_valid());
    return npy::dtype_mgb2np(m_dev_value.dtype());
}

std::vector<size_t> CompGraphCallbackValueProxy::_get_shape() {
    mgb_assert(m_is_active);
    if (m_use_raw_hv)
        return npy::shape2vec(m_hv.shape());
    mgb_assert(m_dev_value.shape_valid());
    return npy::shape2vec(m_dev_value.shape());
}

uintptr_t CompGraphCallbackValueProxy::_pubapi_dev_tensor_ptr(int version) {
    mgb_assert(m_is_active && !m_use_raw_hv);
    mgb_assert(m_dev_value.shape_valid());
    void *ret;
    if (version == 0) {
        ret = m_dev_value.raw_ptr();
    } else {
        init_pubapi_dev_tensor(
                m_pubapi_dev_tensor, &m_dev_value, nullptr, true);
        ret = &m_pubapi_dev_tensor;
    }
    return reinterpret_cast<uintptr_t>(ret);
}

mgb::CompNode CompGraphCallbackValueProxy::_get_comp_node() {
    mgb_assert(m_is_active && !m_use_raw_hv);
    mgb_assert(m_dev_value.shape_valid());
    return m_dev_value.comp_node();
}

/* ================= AsyncExec ================= */
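// AsyncExec::Core owns the compiled AsyncExecutable together with a
// single-consumer worker queue: eager-copy callbacks are pushed onto the
// queue so the device-to-host copy is waited on and the Python callback is
// invoked on the worker thread instead of the execution thread.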
class AsyncExec::Core {
public:
    Core(std::unique_ptr<mgb::cg::AsyncExecutable> f):
        m_func(std::move(f))
    {
    }

    mgb::cg::AsyncExecutable* func() const {
        return m_func.get();
    }

    struct CallbackParam {
        std::vector<CompGraphCallbackValueProxy> value;
        _CompGraphCallback *cb;
    };

    void dispatch_callback(const CallbackParam &param) {
        m_worker.add_task(param);
    }

    void wait_callback_finish() {
        m_worker.wait_all_task_finish();
    }

private:
    std::unique_ptr<mgb::cg::AsyncExecutable> m_func;

    class Worker final: public AsyncQueueSC<CallbackParam, Worker> {
    public:
        void process_one_task(const CallbackParam &task) {
            for (auto &tmp_value: task.value) {
                tmp_value.sync();
            }
            task.cb->call_pycb();
        }
    };

    Worker m_worker;
};

AsyncExec::AsyncExec(std::unique_ptr<mgb::cg::AsyncExecutable> f):
    m_core(std::make_shared<Core>(std::move(f)))
{
}

AsyncExec::~AsyncExec() {
    if (m_core)
        _wait();
}

AsyncExec::Core* AsyncExec::core() const {
    return m_core.get();
}

void AsyncExec::_execute() {
    m_core->func()->execute();
}

std::string AsyncExec::_to_json_str() {
    auto jv = m_core->func()->to_json();
    return jv->to_string();
}

void AsyncExec::_wait() {
    m_core->wait_callback_finish();
    m_core->func()->wait();
}

double AsyncExec::_get_prev_exec_time() {
    return m_core->func()->get_prev_exec_time();
}

SymbolVarArray AsyncExec::_find_mutable_input() {
    ThinHashSet<VarNode*> used_set;
    UserInputVars* user_vars = nullptr;
    auto cb = [&](cg::OperatorNodeBase* opr) {
        if (!user_vars) {
            ComputingGraph* g;
            if (m_multi_part_par_graph)
                g = m_multi_part_par_graph.get();
            else
                g = opr->owner_graph();
            user_vars = &UserInputVars::get(g);
        }
        if (auto var = user_vars->check(opr)) {
            used_set.insert(var);
        }
        return true;
    };
    m_core->func()->iter_opr_seq(cb);
    for (auto i : m_core->func()->get_rt_static_source_deps()) {
        cb(i.dest->owner_opr());
    }

    SymbolVarArray ret;
    ret.reserve(used_set.size());
    ret.insert(ret.begin(), used_set.begin(), used_set.end());
    return ret;
}

void AsyncExec::clear_device_memory() {
    _wait();
    m_core->func()->clear_device_memory();
}

std::vector<std::pair<CompNode, size_t>>
AsyncExec::_update_static_alloc_plan_and_get_size() {
    std::vector<std::pair<CompNode, size_t>> ret;
    for (auto&& i : m_core->func()->update_static_alloc_plan_and_get_size()) {
        ret.emplace_back(i.first, i.second);
    }
    return ret;
}

/* ================= _CompGraphCallback ================= */

void _CompGraphCallback::set_async_exec(const AsyncExec &ae) {
    mgb_assert(!m_ae_core);
    m_ae_core = ae.core();
}

void _CompGraphCallback::set_eager_copy(bool flag) {
    mgb_assert(!m_cb_created);
    m_eager_copy = flag;
}
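// The std::function returned below captures a shared_ptr to this object, so
// the callback object lives exactly as long as the compiled function holds
// the callback; call_pycb() forwards the value proxies to the Python-side
// callback and marks every proxy as finished afterwards, even on exception.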
std::function<void(mgb::SmallVector<mgb::DeviceTensorND> &)>
_CompGraphCallback::make_multi_input_callback() {
    mgb_assert(!m_cb_created);
    m_cb_created = true;
    // shared_ptr would delete this afterwards
    std::shared_ptr<_CompGraphCallback> self(this);
    auto cb = [self](SmallVector<mgb::DeviceTensorND> &data) {
        for (size_t i = self->m_value_proxies.size(); i < data.size(); ++i) {
            self->m_value_proxies.emplace_back();
        }
        if (self->m_eager_copy) {
            mgb_assert(self->m_ae_core);
            for (size_t i = 0; i < self->m_value_proxies.size(); ++i) {
                self->m_value_proxies[i].setup(data[i], true);
            }
            self->m_ae_core->dispatch_callback(
                AsyncExec::Core::CallbackParam{self->m_value_proxies, self.get()}
            );
        } else {
            for (size_t i = 0; i < self->m_value_proxies.size(); ++i)
                self->m_value_proxies[i].setup(data[i], false);
            self->call_pycb();
        }
    };
    return cb;
}

std::function<void(mgb::DeviceTensorND &)> _CompGraphCallback::make_callback() {
    this->m_value_proxies.emplace_back();
    mgb_assert(!m_cb_created);
    m_cb_created = true;
    // shared_ptr would delete this afterwards
    std::shared_ptr<_CompGraphCallback> self(this);
    auto cb = [self](mgb::DeviceTensorND &data) {
        if (self->m_eager_copy) {
            mgb_assert(self->m_ae_core);
            self->m_value_proxies[0].setup(data, true);
            self->m_ae_core->dispatch_callback(
                AsyncExec::Core::CallbackParam{self->m_value_proxies, self.get()}
            );
        } else {
            self->m_value_proxies[0].setup(data, false);
            self->call_pycb();
        }
    };
    return cb;
}

void _CompGraphCallback::call_pycb() {
    try {
        call(m_value_proxies);
    } catch (...) {
        for (auto &m_value_proxy: m_value_proxies) {
            m_value_proxy.on_finished();
        }
        throw;
    }
    for (auto &m_value_proxy: m_value_proxies) {
        m_value_proxy.on_finished();
    }
}

/* ================= CompGraph ================= */

class CompGraph::PyUserData final: public UserDataContainer::UserData,
                                   public NonCopyableObj {
    MGB_TYPEINFO_OBJ_DECL;

    PyObject *m_obj;

public:
    PyUserData() {
        PYTHON_GIL;
        m_obj = PyDict_New();
        mgb_assert(m_obj, "failed to create python object");
    }

    ~PyUserData() {
        PYTHON_GIL;
        Py_DECREF(m_obj);
    }

    PyObject* get() const {
        return m_obj;
    }
};
MGB_TYPEINFO_OBJ_IMPL(CompGraph::PyUserData);

mgb::ComputingGraph& CompGraph::get() const {
    if (m_comp_graph_own)
        return *m_comp_graph_own;
    auto &&val = m_comp_graph_borrow.lock();
    mgb_assert(val, "CompGraph has been destructed");
    return *val;
}

void CompGraph::clear_device_memory() {
    if (!m_comp_graph_own)
        return;
    m_comp_graph_own->clear_device_memory();
}

PyObject* CompGraph::_user_data() {
    auto ct = get().options().user_data.get_user_data_or_create<PyUserData>();
    auto ret = ct->get();
    PYTHON_GIL;
    Py_INCREF(ret);
    return ret;
}

void CompGraph::_add_output_spec(
        mgb::cg::SymbolVar &var, _CompGraphCallback *callback) {
    cg::ComputingGraph::Callback cb;
    if (callback) {
        cb = callback->make_callback();
        m_raw_callbacks.push_back({callback, m_out_specs.size() - 1});
    }
    if (m_out_specs.empty()) {
        m_out_specs.emplace_back();
    }
    m_out_specs.back().push_back({var, cb});
}
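// Single-part compilation: optionally run the inference optimizer on the
// output vars; if eager evaluation is enabled, or copying is requested and
// the graph already has a computing sequence, the endpoints are replicated
// into a fresh graph (re-registering its Host2DeviceCopy inputs) before
// compiling.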
AsyncExec CompGraph::_do_compile(bool copy, bool optimize_for_inference) {
    mgb_assert(m_out_specs.size() == 1, "got %zu output specs for compile",
               m_out_specs.size());
    auto&& spec = m_out_specs[0];
    if (optimize_for_inference) {
        SymbolVarArray vars;
        vars.reserve(spec.size());
        for (auto&& i : spec) {
            vars.push_back(i.first);
        }
        vars = gopt::optimize_for_inference(vars, {});
        mgb_assert(vars.size() == spec.size());
        for (size_t i = 0; i < vars.size(); ++i) {
            spec[i].first = vars[i];
        }
    }
    std::unique_ptr<mgb::cg::AsyncExecutable> async_executable;
    if (get().options().eager_evaluation ||
        (copy && get().current_comp_seq())) {
        // need to copy a new comp graph
        SymbolVarArray vars;
        vars.reserve(spec.size());
        for (auto&& i : spec) {
            vars.emplace_back(i.first);
        }
        // copy graph
        auto new_graph = mgb::ComputingGraph::make();
        SymbolVarArray new_vars =
                replace_vars_comp_graph(std::move(vars), new_graph.get());
        mgb_assert(new_vars.size() == spec.size());
        // register input
        auto h2d = find_h2d(new_vars);
        for (auto&& i : h2d) {
            UserInputVars::get(new_graph.get()).register_var(i);
        }
        mgb::ComputingGraph::OutputSpec new_spec;
        new_spec.reserve(spec.size());
        for (size_t i = 0; i < spec.size(); ++i) {
            new_spec.emplace_back(mgb::ComputingGraph::OutputSpecItem{
                    new_vars[i], spec[i].second});
        }
        async_executable = new_graph->compile(new_spec);
    } else {
        async_executable = get().compile(spec);
    }
    AsyncExec ret{std::move(async_executable)};
    for (auto&& i : m_raw_callbacks) {
        mgb_assert(!i.second);
        i.first->set_async_exec(ret);
    }
    _clear_output_spec();
    return ret;
}

std::vector<AsyncExec> CompGraph::_do_compile_multi_part() {
    // last spec is empty due to an extra call to _add_multi_part_endpoint()
    mgb_assert(m_out_specs.size() > 1 && m_out_specs.back().empty(),
               "got %zu output specs for multi-part compile",
               m_out_specs.size());
    m_out_specs.pop_back();
    std::vector<AsyncExec> ret;
    ret.reserve(m_out_specs.size());
    auto graph = get().shared_from_this();
    for (auto&& i : graph->compile_multi_part(m_out_specs)) {
        ret.emplace_back(std::move(i));
    }
    for (auto&& i : ret) {
        i.set_multi_part_par_graph(graph);
    }
    for (auto&& i : m_raw_callbacks) {
        i.first->set_async_exec(ret.at(i.second));
    }
    _clear_output_spec();
    return ret;
}

/* ================= SharedScalar ================= */

SharedScalar::SharedScalar(PyObject *val):
    m_val{std::make_shared<DTypeScalar>()}
{
    _set(val);
}

HostTensorND& SharedScalar::val_as_host_nd() {
    if (m_val_as_host_nd.empty()) {
        HostTensorStorage storage;
        storage.reset(CompNode::default_cpu(), m_val->dtype().size(),
                      {m_val, static_cast<dt_byte*>(
                              const_cast<void*>(m_val->storage()))});
        m_val_as_host_nd.reset(storage, {TensorShape{1}, m_val->dtype()});
    }
    return m_val_as_host_nd;
}

void SharedScalar::_set(PyObject *val) {
    auto tensor = npy::np2tensor(val, npy::Meth::borrow(), {});
    mgb_assert(tensor.layout().is_scalar(),
               "value given to SharedScalar must be scalar; got shape %s",
               tensor.shape().to_string().c_str());
    if (m_dtype_locked) {
        mgb_assert(tensor.dtype() == m_val->dtype(),
                   "dtype for SharedScalar has been locked as %s, "
                   "but attempt to set it to %s", m_val->dtype().name(),
                   tensor.dtype().name());
    }
    m_val->set_raw(tensor.dtype(), tensor.raw_ptr());

    if (!m_dev_val.empty()) {
        auto &&hv = val_as_host_nd();
        for (auto &&i: m_dev_val)
            i.second->copy_from_fixlayout(hv);
    }
}

PyObject* SharedScalar::_get() {
    HostTensorND hv{CompNode::default_cpu(), TensorShape{1}, m_val->dtype()};
    memcpy(hv.raw_ptr(), m_val->storage(), m_val->dtype().size(1));
    return npy::ndarray_from_tensor(hv, npy::ShareType::TRY_SHARE);
}

SymbolVar SharedScalar::_as_sym_var(CompGraph &cg, mgb::CompNode &cn) {
    m_dtype_locked = true;
    auto &&dv = m_dev_val[cn];
    auto &&hv = val_as_host_nd();
    if (!dv) {
        dv = std::make_shared<DeviceTensorND>(cn);
        dv->copy_from(hv);
    }
    return opr::SharedDeviceTensor::make(cg.get(), dv,
            ssprintf("SharedScalar@%p", m_val.get()));
}

/* =============== Operator =============== */

const std::unique_ptr<mgb::OprFootprint> Operator::sm_opr_footprint_ptr{
        std::make_unique<mgb::OprFootprint>()};

/* ================= misc ================= */
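// Fill a symbolic var with a Python scalar while keeping the var's dtype:
// integer dtypes are converted through int and float dtypes through float;
// quantized, bool, byte and low-bit dtypes fall through to a ConversionError.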
SymbolVar fill_retain_dtype(SymbolVar var, PyObject *value) {
    auto tensor = npy::np2tensor(value, npy::Meth::borrow(), {});
    mgb_assert(tensor.shape().is_scalar(),
               "value for fill_retain_dtype must be scalar; got shape %s",
               tensor.shape().to_string().c_str());
    switch (tensor.dtype().enumv()) {
#define cb(_dt) case DTypeTrait<_dt>::enumv: \
        static_assert(sizeof(DTypeTrait<_dt>::ctype) <= sizeof(int), \
                      "bad dtype size"); \
        return var.fill_retain_dtype(static_cast<int>( \
                    *tensor.ptr<DTypeTrait<_dt>::ctype>()));
        MEGDNN_FOREACH_COMPUTING_DTYPE_INT(cb)
#undef cb
        case DTypeEnum::Float32:
            return var.fill_retain_dtype(*tensor.ptr<dt_float32>());
        case DTypeEnum::Float16:
            return var.fill_retain_dtype(
                    static_cast<float>(*tensor.ptr<dt_float16>()));
        case DTypeEnum::BFloat16:
            return var.fill_retain_dtype(
                    static_cast<float>(*tensor.ptr<dt_bfloat16>()));
        // TODO: What does this mean?
        case DTypeEnum::Quantized8Asymm:
        case DTypeEnum::QuantizedS32:
        case DTypeEnum::QuantizedS8:
        case DTypeEnum::Quantized4Asymm:
        case DTypeEnum::QuantizedS4:
        case DTypeEnum::Byte:
        case DTypeEnum::QuantizedS16:
        case DTypeEnum::Bool:
            break;
#define cb(low_bit, size) \
        case DTypeEnum::low_bit##size: \
            break;
        MEGDNN_FOREACH_LOWBIT_DTYPE(cb)
#undef cb
    }
    throw ConversionError(ssprintf(
            "unsupported value dtype: %s", tensor.dtype().name()));
}

PyObject* get_symvar_inferred_value(mgb::SymbolVar symvar) {
    auto var = symvar.node();
    auto&& mgr = var->owner_graph()->static_infer_manager();
    using IT = cg::static_infer::InferType;
    auto it = mgr.get_infer_type(var);
    if (!(it.value & (IT::CONST | IT::RT_STATIC)))
        Py_RETURN_NONE;
    auto val = mgr.infer_value_fallible(var);
    if (!val)
        Py_RETURN_NONE;
    auto hv = HostTensorND::make_proxy(*val);
    return npy::ndarray_from_tensor(hv, npy::ShareType::MUST_UNSHARE);
}

void _mgb_global_finalize() {
    CompNode::finalize();
    g_global_finalize_called = true;
}

bool global_finalized() {
    return g_global_finalize_called;
}

std::vector<size_t> _get_mgb_version() {
    return {MGB_MAJOR, MGB_MINOR, MGB_PATCH, MGB_IS_DEV};
}
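// use_virtual_grad == -1 means "decide from the graph": virtual grad is
// enabled when abs(graph_opt_level) >= 2. When it is used, a VirtualGrad opr
// is inserted per wrt instead of computing the gradient immediately;
// otherwise cg::grad() expands the gradient right away.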
SymbolVarArray _grad(SymbolVar target, SymbolVarArray wrts,
                     bool warn_mid_wrt, int use_virtual_grad,
                     bool return_zero_for_nodep) {
    if (use_virtual_grad == -1) {
        use_virtual_grad = std::abs(
                target.node()->owner_graph()->options().graph_opt_level) >= 2;
    }

    if (use_virtual_grad) {
        mgb_assert(return_zero_for_nodep,
                   "can't return a null var when using virtual grad opr");
        SymbolVarArray ret;
        ret.reserve(wrts.size());
        for (auto&& wrt : wrts) {
            ret.push_back(opr::VirtualGrad::make(target, wrt));
        }
        return ret;
    }

    return cg::grad(target, wrts, warn_mid_wrt, return_zero_for_nodep);
}

SymbolVar _inter_graph_trans_var(
        CompGraph &dest_graph, SymbolVar src) {
    auto &&graph = dest_graph.get();
    auto trans = mgb::cg::InterGraphVarTransformer::get(graph);
    mgb_assert(trans, "trans func on graph %p has not been setup", &graph);
    return trans->trans(src.node());
}

SymbolVar _get_graph_optimizer_replaced_var(SymbolVar src) {
    return gopt::GraphOptimizer::var_replace_lookup(src.node());
}

void mark_as_input(ComputingGraph* cg, SymbolVar var) {
    VarNode* node = var.node();
    mgb_assert(node->owner_graph() == cg);
    mgb_assert(node->owner_opr()->same_type<opr::Host2DeviceCopy>());
    UserInputVars::get(cg).register_var(var);
}

namespace {
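// In-place parameter update on device: dest = dest * alpha + delta * beta +
// bias, with delta broadcast to dest's shape. The alpha == 0, beta == 1,
// bias == 0 case degenerates to a plain copy; otherwise a megdnn AddUpdate
// operator is created on the fly and executed on dest's comp node.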
void add_update_impl(const DeviceTensorND& dest,
                     const DeviceTensorND& delta_nobrd,
                     float alpha, float beta, float bias) {
    auto&& cn = dest.comp_node();
    using DT = CompNode::DeviceType;
    mgb_assert(cn == delta_nobrd.comp_node() &&
               (cn.device_type() == DT::CUDA || cn.device_type() == DT::CPU ||
                cn.device_type() == DT::ROCM));
    mgb_assert(dest.dtype() == delta_nobrd.dtype());
    auto&& delta = delta_nobrd.sub(SubTensorSpec::make_from_offset_elem(
            delta_nobrd.layout().broadcast(dest.shape()), 0));
    cn.activate();
    if (!static_cast<bool>(alpha) && beta == 1 &&
        !static_cast<bool>(bias)) {
        dest.copy_from_fixlayout(delta);
    } else {
        auto&& handle = MegDNNHandle::get(
                CompNodeEnv::from_comp_node(cn)).handle();
        auto&& op = handle->create_operator<megdnn::AddUpdate>();
        op->param() = {alpha, beta, bias};
        op->exec(dest.as_megdnn(), delta.as_megdnn());
        if (cn.device_type() == DT::CPU && cn != CompNode::default_cpu()) {
            CompNodeEnv::from_comp_node(cn).cpu_env().dispatch(
                    [p = op.release()] { delete p; }
            );
        }
    }
}

} // anonymous namespace

void _add_update_fastpath(SharedND& dest_, SharedND& delta_,
                          float alpha, float beta, float bias) {
    auto&& dest = dest_.dev_tensor();
    auto&& delta = delta_.dev_tensor();
    add_update_impl(*dest, *delta, alpha, beta, bias);
}

void _add_update_fastpath(SharedND& dest_, CompGraphCallbackValueProxy& delta_,
                          float alpha, float beta, float bias) {
    auto&& dest = dest_.dev_tensor();
    auto&& delta = delta_.dev_tensor();
    add_update_impl(*dest, delta, alpha, beta, bias);
}

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}

The MegEngine installation package bundles the CUDA environment needed to run code on GPUs, so there is no separate CPU or GPU build to choose between. To run GPU programs, make sure the machine has GPU hardware installed along with the proper driver. If you would like to try deep learning development on cloud GPU compute, you are welcome to visit the MegStudio platform.