
cg.h 25 kB

/**
 * \file src/core/include/megbrain/graph/cg.h
 * MegEngine is Licensed under the Apache License, Version 2.0 (the "License")
 *
 * Copyright (c) 2014-2020 Megvii Inc. All rights reserved.
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
 * express or implied.
 */
#pragma once

#include "megbrain/graph/operator_node.h"
#include "megbrain/graph/symbol_var.h"
#include "megbrain/graph/static_infer.h"
#include "megbrain/graph/seq_comp_node_opt.h"
#include "megbrain/utils/event.h"
#include "megbrain/system.h"

#if MGB_ENABLE_JSON
#include "megbrain/utils/json.h"
#endif
namespace mgb {
namespace cg {

/*!
 * \brief allocation strategy for device storage in computing graphs
 *
 * Note: all the \p graph params would be NULL for requests originating from
 * ComputingGraph::prealloc_static_storage. Otherwise they are not NULL.
 *
 * This base class already provides an implementation using memory management
 * on the comp node. Sub-classes can override only the methods of interest.
 */
class DeviceMemoryAllocator {
public:
    //! a version sentinel value that should never be returned by
    //! static_alloc_version()
    static constexpr size_t VERSION_INVALID = ~static_cast<size_t>(0);

    virtual ~DeviceMemoryAllocator() = default;

    /*!
     * \brief implement the allocation strategy for static graph-wise storage
     * \param[in] graph the computing graph that requests the memory
     * \param[out] dest output tensor storage; its comp node has been
     *      initialized to the target comp node
     */
    virtual void alloc_static(ComputingGraph* graph, DeviceTensorStorage& dest,
                              size_t size);

    /*!
     * \brief implement the allocation strategy for dynamic storage of a
     *      variable
     * \param[in] var the variable that needs memory
     *
     * Note: if allocation fails, MemAllocError should be raised so
     * VarDevMemDefragmenter can catch the error and do defragmentation.
     */
    virtual void alloc_dynamic(VarNode* var, DeviceTensorStorage& dest,
                               size_t size);

    /*!
     * \brief ensure a contiguous storage for the memory defragmenter
     *
     * When doing memory defragmentation, it is useful to ensure that
     * subsequent allocation requests can be placed in a contiguous storage.
     * This function would be called before calling alloc_dynamic() on the
     * individual vars.
     */
    virtual void defrag_prealloc_contig(ComputingGraph* graph,
                                        CompNode comp_node, size_t size);

    /*!
     * \brief version of the static allocation strategy
     *
     * If the version changes before graph execution, static memory would be
     * reallocated. This function would only be called once in each graph
     * execution.
     */
    virtual size_t static_alloc_version(ComputingGraph* graph) const;
};
/**
 * \brief common optimization options; used both when optimizing for
 * inference during graph dump and for graph optimization at runtime.
 */
struct GraphCommonOptimizeOptions {
    //! whether to use float16 for I/O while computing in float32
    bool f16_io_f32_comp = false;
    //! whether to enable transform to a pure float16 model
    bool f16_io_comp = false;
    //! whether to enable conv-bias nonlinearity fusion
    bool fuse_conv_bias_nonlinearity = false;
    //! fuse patterns like ReLU(conv_bias(x, w, b) + z) or
    //! conv_bias(x, w, b) + z -> conv_bias(x, w, b, z)
    bool fuse_conv_bias_with_z = false;
    //! whether to enable weight preprocessing; if enabled it may use more
    //! memory, so it is disabled by default. When weight preprocessing is
    //! enabled, the input shape must not change
    bool weight_preprocess = false;
    //! fuse preprocess patterns, like astype + pad_channel + dimshuffle
    bool fuse_preprocess = false;

    enum LayoutTransform : uint32_t {
        DEFAULT,
        NCHW4,      ///< compute using NCHW4 tensor format
        NHWCD4,     ///< compute using NHWCD4 tensor format
        NCHW88,     ///< compute using NCHW88 tensor format
        NCHW44,     ///< compute using NCHW44 tensor format
        NCHW44_DOT, ///< compute using NCHW44_DOT tensor format
        NCHW32,     ///< compute using NCHW32 tensor format, used for
                    ///< tensorcore
        CHWN4,      ///< compute using CHWN4 tensor format, mainly used
                    ///< for cuda
    };
    LayoutTransform layout_transform = LayoutTransform::DEFAULT;

#define SET(n)                                      \
    GraphCommonOptimizeOptions& enable_##n() {      \
        n = true;                                   \
        return *this;                               \
    }                                               \
    GraphCommonOptimizeOptions& disable_##n() {     \
        n = false;                                  \
        return *this;                               \
    }                                               \
    bool has_set_##n() { return n == true; }

    SET(f16_io_f32_comp);
    SET(f16_io_comp);
    SET(fuse_conv_bias_nonlinearity);
    SET(fuse_conv_bias_with_z);
    SET(fuse_preprocess);
    SET(weight_preprocess);
#undef SET

#define SET(_trans, _trans_capital)                                  \
    GraphCommonOptimizeOptions& enable_##_trans() {                  \
        mgb_assert(layout_transform == LayoutTransform::DEFAULT);    \
        layout_transform = LayoutTransform::_trans_capital;          \
        return *this;                                                \
    }                                                                \
    GraphCommonOptimizeOptions& disable_##_trans() {                 \
        layout_transform = LayoutTransform::DEFAULT;                 \
        return *this;                                                \
    }                                                                \
    bool has_set_##_trans() const {                                  \
        return layout_transform == LayoutTransform::_trans_capital;  \
    }

    SET(nchw4, NCHW4);
    SET(nhwcd4, NHWCD4);
    SET(nchw88, NCHW88);
    SET(nchw44, NCHW44);
    SET(nchw44_dot, NCHW44_DOT);
    SET(nchw32, NCHW32);
    SET(chwn4, CHWN4);
#undef SET
};
/*!
 * \brief Computing graph.
 *
 * A computing graph manages operators and variables. It can be compiled to
 * create an AsyncExecutable that computes given variables.
 */
class ComputingGraph : public std::enable_shared_from_this<ComputingGraph>,
                       public CompNodeDepedentObject {
public:
    ComputingGraph();
    virtual ~ComputingGraph() = default;

    /*!
     * \brief graph ID
     *
     * Each graph would be assigned a unique increasing ID; useful for
     * debugging
     */
    size_t id() const {
        return m_id;
    }

    virtual size_t next_node_id() = 0;

    static std::shared_ptr<ComputingGraph> make();

    //! assert that the refcnt for ptr is one and destroy the ptr
    static void assert_destroy(std::shared_ptr<ComputingGraph>& ptr);

    /*!
     * \brief callback to be invoked when some output is ready
     *
     * Note that the output may be deallocated after the call returns if no
     * further node depends on the output
     */
    using Callback = thin_function<void(DeviceTensorND&)>;

    //! specify the callback of one output var
    using OutputSpecItem = std::pair<SymbolVar, Callback>;

    /*!
     * specifies what outputs are required in compile(); the callback could
     * be empty, to ensure that the var is computed
     */
    using OutputSpec = std::vector<OutputSpecItem>;

    /*!
     * \brief information on how a var is needed by others
     */
    struct VarReceiverInfo;

    /*!
     * \brief generate an executable object that, when executed, would call
     *      the callbacks on the output values
     *
     * Also note that only the most recently compiled function could be
     * used, since oprs may have internal state
     */
    virtual std::unique_ptr<AsyncExecutable> compile(
            const OutputSpec& out_spec) = 0;

    /*!
     * \brief compile multiple graph parts for partial execution
     *
     * The parts in \p out_specs correspond to the execution steps of this
     * graph. The returned AsyncExecutable objects should be called in the
     * same order as the parts given here.
     *
     * The created AsyncExecutable objects would belong to newly generated
     * graphs (not this graph), so functions compiled by compile() and
     * compile_multi_part() can co-exist. All the new graphs would share
     * device memory with this graph.
     */
    virtual SmallVector<std::unique_ptr<AsyncExecutable>>
    compile_multi_part(const SmallVector<OutputSpec>& out_specs) = 0;

    /*!
     * \brief insert a new operator node; its inputs must exist in the
     *      current graph
     * \return the node in the graph (may be another node due to
     *      deduplication)
     */
    virtual OperatorNodeBase* insert_opr(
            std::unique_ptr<OperatorNodeBase> opr) = 0;

    /*!
     * \brief used by OperatorNodeBase to allocate its outputs
     */
    template <typename... Args>
    VarNode* alloc_varnode(Args&&... args) {
        return new (alloc_varnode_storage()) VarNode(std::forward<Args>(args)...);
    }

    inline void free_varnode(VarNode* var) {
        var->~VarNode();
        free_varnode_storage(var);
    }
protected:
    /*!
     * \brief provided by impl to support alloc_varnode
     */
    virtual void* alloc_varnode_storage() = 0;
    virtual void free_varnode_storage(void* ptr) = 0;

public:
    /*!
     * \brief get current computing sequence
     */
    virtual AsyncExecutable* current_comp_seq() = 0;

    /*!
     * \brief get information on how a variable is needed in the current
     *      comp seq
     */
    virtual const VarReceiverInfo& var_receiver_in_current_comp_seq(
            const VarNode* var) const = 0;

    virtual std::string get_mem_allocation_info() const = 0;

    /*!
     * \brief find a var node by its ID
     *
     * Note: this searches recursively in subgraphs, and its complexity is
     * linear with respect to the number of vars (there is no indexing on
     * var node ID)
     *
     * \return VarNode pointer if it is found, or nullptr if no var is
     *      found to have an equal ID
     */
    virtual VarNode* find_var_by_id(size_t id) const = 0;

    /*!
     * \brief get underlying event connector
     */
    SyncEventConnecter& event() {
        return m_event;
    }

    const SyncEventConnecter& event() const {
        return m_event;
    }
    struct Options {
        //! attribute for a specific operator
        struct OprAttribute {
#if MGB_ENABLE_SUBLINEAR
            /*!
             * if any opr is in this set, then the split of blocks can only
             * happen on those oprs.
             */
            ThinHashSet<OperatorNodeBase*> sublinear_memory_endpoint;

            bool get_sublinear_memory_endpoint(OperatorNodeBase* opr) const {
                return sublinear_memory_endpoint.count(opr);
            }
#endif
        } opr_attribute;

        //! sequence compile optimization options
        struct SeqOpt {
            //! whether to enable memory forwarding to optimize mem plans
            bool enable_mem_plan_opt = true;

            //! whether to enable static memory reuse (i.e. using an
            //! optimized static memory allocation algorithm)
            bool enable_mem_reuse_alloc = true;

            //! whether to enable comp node optimization (e.g. using a copy
            //! stream for I/O operators)
            bool enable_seq_comp_node_opt = true;
        } seq_opt;

        //! graph optimization options
        struct GraphOpt : GraphCommonOptimizeOptions {
            //! whether to enable JIT; JIT would also be enabled at O3.
            //! This value indicates the JIT level: 1 for basic elemwise
            //! oprs; 2 to also include reduce oprs
            uint8_t jit = 0;

            //! whether to enable fine-grained TensorRT opr replacement
            bool tensorrt = false;
        } graph_opt;

        //! get attribute for an operator
        inline const OprAttribute& get_opr_attribute(
                OperatorNodeBase* opr) const;

        /*!
         * graph optimization level:
         * 0: disable
         * 1: level-1: inplace arith transformations during graph
         *    construction
         * 2: level-2: level-1, plus global optimization before graph
         *    compiling
         * 3: also enable JIT
         * <0: corresponding level, with result check for debug
         */
        int16_t graph_opt_level = 2;

        /*!
         * max size of allreduce packs in MB;
         * set this option to zero to disable PackAllReducePass
         */
        int16_t allreduce_pack_max_size = 0;

        /*!
         * do not pack the first n allreduces;
         * PackAllReducePass is disabled if allreduce_pack_max_size is zero
         */
        int16_t allreduce_pack_ignore_first = 2;

        /*!
         * set logging level; a larger number means more verbose:
         * 0: no log info
         * 1: static memory allocation status,
         *    WorkspaceLimitGetter summary,
         *    optimizer summary
         * 2: optimizer var replace details during graph compiling,
         *    duplicated operators
         */
        uint16_t log_level = 1;

        /*!
         * async exec: dispatch on separate threads for different comp_node
         * 0: do not perform async dispatch
         * 1: dispatch async if there is more than one comp node with
         *    limited queue
         * mask 0b10: async if there are multiple comp nodes
         * mask 0b100: always async
         */
        uint16_t async_exec_level = 1;

        //! force dynamic memory alloc for all vars
        bool force_dynamic_alloc = false;

        //! whether to perform var sanity check on the first run
        bool var_sanity_check_first_run = true;

        //! whether to allocate static memory just after compiling the graph
        bool allocate_static_mem_after_graph_compile = false;

        /*!
         * whether to only perform non-computing tasks (like memory
         * allocation and queue initialization) for the next exec. This
         * would be reset to false when the graph is executed.
         */
        bool fake_next_exec = false;

        //! whether to enable sublinear memory optimization
        bool enable_sublinear_memory_opt = false;

        //! control parameters for sublinear memory optimization
        struct SublinearMemConfig {
            int thresh_nr_try = 10;
            int genetic_nr_iter = 0;
            int genetic_pool_size = 20;
            int lb_memory = 0;
            int num_worker = sys::get_cpu_count() / 2;
        } sublinear_mem_config;

        //! do not re-profile to select the best impl algo when the input
        //! shape changes (use the previous algo)
        bool no_profiling_on_shape_change = false;

        //! whether to perform defragmentation when memory allocation for a
        //! dynamic var fails
        bool enable_var_mem_defragment = true;

        //! whether to reshape a grad var whose wrt shape is statically
        //! inferable but whose own shape is dynamic
        bool enable_grad_var_static_reshape = false;

        /*!
         * whether to enable memory swap;
         * since swap's performance is much worse than sublinear's,
         * it is recommended to try sublinear first
         */
        bool enable_memory_swap = false;

        /*!
         * whether to use CompNodeSeqRecorder to record the execution
         * sequence and directly replay it for later executions.
         *
         * Level 1 is mainly used to speed up execution (especially for
         * opencl); level 2 is used for reducing memory usage.
         *
         * Level 1 constraints:
         * 1. All vars must be statically allocated
         * 2. Host input/output buffer pointers can not be changed if the
         *    shape is not changed (this is not checked during execution
         *    for efficiency considerations; this is potentially dangerous)
         * 3. Synchronization can only occur at the end of execution
         * 4. Not all comp node implementations support recording the
         *    computing sequence
         * 5. Only one comp node can be used in the graph
         *
         * Level 2: besides recording the computing sequence, the
         * dependencies are also moved into the compiled func (see
         * GraphExecutable::ExecDependency). Additional constraints:
         * 1. Shapes can not change
         * 2. Both fake_next_exec and var_sanity_check_first_run must be
         *    disabled
         * 3. Var shapes must be correctly set up before calling compile()
         */
        uint8_t comp_node_seq_record_level = 0;

#if !MGB_BUILD_SLIM_SERVING
        //! whether to evaluate var node values as they are inserted
        bool eager_evaluation = false;
#endif

        bool imperative_proxy_graph = false;

        /*!
         * Request that operators should not force update their inputs.
         *
         * THIS FLAG IS RESERVED FOR INTERNAL USE
         *
         * When this flag is set, operators like AddUpdate and BatchNorm
         * will still attempt to inplace update their inputs, but failing
         * to do so will not be considered an error.
         */
        bool no_force_inplace = false;

        //! add extra deps for the comp seq if a specific var is dependent
        ThinHashMap<VarNode*, VarNodeArray> extra_vardeps;

        //! contains any user data associated with this graph
        UserDataContainer user_data;
    };  // Options

    Options& options() {
        return m_options;
    }

    const Options& options() const {
        return m_options;
    }
    /*!
     * \brief get an instance of the static var value infer manager
     */
    virtual static_infer::StaticInferManager& static_infer_manager() = 0;

    /*!
     * \brief get an instance of the sequence computing node optimizer
     */
    virtual SeqCompNodeOptimizer& seq_comp_node_optimizer() = 0;

    /*!
     * \brief share static device memory with another computing graph
     *
     * To share memory for all graphs g[0..n-1], the correct way is to call
     * g[i].share_device_memory_with(g[0]) for i in range(1, n).
     *
     * This method must be called before compiling, and the user must
     * ensure that AsyncExecutable objects with shared static device memory
     * would not be executed simultaneously.
     */
    virtual void share_device_memory_with(ComputingGraph& other) = 0;

    /*!
     * \brief set a custom DeviceMemoryAllocator to be used
     *
     * The given allocator would be used for allocation in all graphs
     * involved in share_device_memory_with() calls related to this graph.
     */
    virtual void set_device_memory_allocator(
            std::shared_ptr<DeviceMemoryAllocator> allocator) = 0;

    /*!
     * \brief get the size of the currently allocated static device memory
     *      buffer on the given computing node
     * \return memory size in bytes
     */
    virtual size_t get_device_memory_size(CompNode cn) = 0;

    /*!
     * \brief clear statically allocated device memory
     * \return use count of the device memory before clearing; a value of 1
     *      indicates the memory would actually be released
     */
    virtual size_t clear_device_memory() = 0;

    /*!
     * \brief set this graph as a subgraph of another
     *
     * This mechanism is used to implement special control operators like
     * loop. Being a subgraph has the following consequences:
     * 1. the node ID counter would be shared
     * 2. when an AsyncExecutable compiled from the subgraph is called, it
     *    would not wait for the previous run to finish; instead, when the
     *    AsyncExecutable from the parent graph is being waited on, it
     *    would call wait() on AsyncExecutables from the subgraph.
     * 3. some options would be passed from the parent graph to the
     *    subgraph
     *
     * Note that the reference to a subgraph should be kept by its owner
     * operator, whose reference is kept by the parent graph.
     */
    virtual void set_as_subgraph(ComputingGraph& par_graph) = 0;

    //! get the number of operators inserted in this graph
    virtual size_t nr_oprs_in_graph() const = 0;

#if !MGB_THREAD_SAFE
    /*!
     * \brief pre-allocate static storage used for internal states of
     *      computing graphs
     *
     * This is mainly used to reduce memory usage in single-threaded
     * environments. If a newly compiled function requires a larger memory
     * size than previous ones, megbrain has to re-allocate the static
     * storage buffer and the previous buffers are all wasted (because they
     * should have been shared with the largest buffer).
     *
     * If we know the max buffer size over all functions, the buffer can be
     * pre-allocated so it can be shared by all of them.
     *
     * A common practice is to call prealloc_static_storage(0) to get the
     * current buffer size at the end of the program, and use this value as
     * the buffer size in the next run.
     *
     * \param size anticipated max size of all buffers, in bytes
     * \return current buffer size
     */
    static size_t prealloc_static_storage(size_t size);
#endif

    /*!
     * \brief record the given async error; this function should be called
     *      rather than throwing an exception directly for errors that
     *      occur during computation.
     */
    virtual void record_async_error(
            std::unique_ptr<MegBrainError> async_exc) = 0;

private:
    SyncEventConnecter m_event;
    Options m_options;
    size_t m_id;
};
struct ComputingGraph::VarReceiverInfo {
    //! number of requests for direct computing by passing an empty callback
    size_t nr_direct_comp_req = 0;

    //! number of operators that need the device value of this var
    size_t dev_value = 0;

    //! last dev value reader in the computing sequence
    OperatorNodeBase* last_dev_value_reader = nullptr;

    //! number of operators that need the shape of this var, which can not
    //! be statically inferred
    size_t shape = 0;

    //! number of operators that need the host value of this var, which can
    //! not be statically inferred
    size_t host_value = 0;

    //! number of operators in \p dev_value and \p host_value that allow
    //! this var to be empty
    size_t allow_empty_value = 0;

    //! whether nothing is needed at all
    bool empty() const {
        return !nr_direct_comp_req && !dev_value && !shape && !host_value;
    }

    //! whether the computed value is needed (i.e. either dev_value, shape,
    //! or host_value)
    bool value_needed() const {
        return dev_value || shape || host_value;
    }

    //! whether this var can be empty
    bool is_empty_allowed() const {
        return allow_empty_value == host_value + dev_value;
    }

    std::string to_string() const;
};

/*!
 * \brief helper function for creating an operator with a unique output and
 *      inserting it into the graph
 */
template <typename Node, typename... Args>
SymbolVar SymbolVar::insert_single_output_opr(Args&&... args) const {
    return m_node->owner_graph()
            ->insert_opr(std::make_unique<Node>(std::forward<Args>(args)...))
            ->output(0);
}

}  // namespace cg
}  // namespace mgb

// vim: syntax=cpp.doxygen foldmethod=marker foldmarker=f{{{,f}}}
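
The OutputSpec/compile() interface declared above is the main entry point of this header. Below is a minimal usage sketch, assuming the opr::Host2DeviceCopy operator and the SymbolVar arithmetic overloads from other MegEngine headers (they are not declared in cg.h itself):

#include "megbrain/graph.h"
#include "megbrain/opr/io.h"
#include "megbrain/opr/basic_arith_wrapper.h"

using namespace mgb;

// Compile and run y = x + x once, copying the result back to the host.
// Host2DeviceCopy and operator+ are assumed from other MegEngine headers.
void run_once(const std::shared_ptr<HostTensorND>& host_x) {
    auto graph = cg::ComputingGraph::make();
    graph->options().graph_opt_level = 2;  // the default; shown for illustration

    auto x = opr::Host2DeviceCopy::make(*graph, host_x);
    auto y = x + x;

    // an OutputSpec pairs each requested var with a callback receiving the
    // computed device value; an empty callback would merely force the var
    // to be computed
    HostTensorND host_y;
    cg::ComputingGraph::OutputSpec spec{
            {y, [&](DeviceTensorND& dev_y) { host_y.copy_from(dev_y); }}};

    // only the most recently compiled function may be used
    auto func = graph->compile(spec);
    func->execute();
    func->wait();  // the callback has run by the time wait() returns
}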
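
Note that the SET macros in GraphCommonOptimizeOptions generate chainable enable_*/disable_* setters that return the options object itself, so fusion and layout switches can be combined fluently; a small sketch (setter names taken directly from the macro expansions above):

graph->options().graph_opt
        .enable_fuse_conv_bias_nonlinearity()
        .enable_nchw88();  // asserts no other layout transform was chosen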
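
Because the DeviceMemoryAllocator base class already implements allocation through the comp node, a custom allocator passed to set_device_memory_allocator() only needs to override the hooks of interest. The following illustrative sketch (the class name and peak-tracking bookkeeping are assumptions, not from the source) records the largest static allocation request and then delegates to the default behavior:

#include <atomic>

using namespace mgb;

class PeakTrackingAllocator final : public cg::DeviceMemoryAllocator {
    std::atomic<size_t> m_peak{0};

public:
    void alloc_static(cg::ComputingGraph* graph, DeviceTensorStorage& dest,
                      size_t size) override {
        // lock-free max-update of the peak request size
        size_t prev = m_peak.load();
        while (prev < size && !m_peak.compare_exchange_weak(prev, size)) {
        }
        // delegate the actual allocation to the base-class strategy
        DeviceMemoryAllocator::alloc_static(graph, dest, size);
    }

    size_t peak() const { return m_peak.load(); }
};

// usage: install before compiling; per the docs above, the allocator is
// shared by all graphs connected through share_device_memory_with()
// graph->set_device_memory_allocator(
//         std::make_shared<PeakTrackingAllocator>());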

The MegEngine installation package integrates the CUDA environment required to run code on GPUs, so there is no need to distinguish between CPU and GPU builds. To run GPU programs, make sure the machine has GPU hardware and the drivers installed. If you would like to experience deep-learning development on a cloud GPU computing platform, you are welcome to visit the MegStudio platform.