You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

general.h 49 kB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449
  1. #pragma once
  2. #include "megdnn/internal/opr_header_prologue.h"
  3. #include "megdnn/thin/small_vector.h"
  4. namespace megdnn {
  5. /*!
  6. * \brief standard element-wise operator
  7. *
  8. * Inputs must have the same dtype, and their shapes must be broadcastable into a final
  9. * shape. They can have arbitrary layouts, but non-contiguous and non-broadcast
  10. * layouts may harm performance seriously.
  11. *
  12. * Output dtype is the same as input dtype (note that even for compare oprs this
  13. * is true, e.g. float == float returns value of float). Output layout must be
  14. * contiguous.
  15. */
  16. class ElemwiseForward : public OperatorBase {
  17. DEF_OPR_PARAM(Elemwise);
  18. DEF_OPR_IMPL(ElemwiseForward, OperatorBase, -1, 1);
  19. public:
  20. using Mode = Param::Mode;
  21. //! information about a mode
  22. struct ModeTrait {
  23. uint32_t arity; //!< number of inputs needed
  24. bool commutable; //!< whether arity == 2 and inputs commutable
  25. bool allow_int; //!< whether int inputs allowed
  26. bool allow_float; //!< whether float inputs allowed
  27. bool allow_bool; //!< whether bool inputs allowed
  28. const char* name; //!< name of the mode
  //! construct an empty trait: zero arity, every flag false, null name
  29. ModeTrait()
  30. : arity(0),
  31. commutable(false),
  32. allow_int(false),
  33. allow_float(false),
  34. allow_bool(false),
  35. name(nullptr) {}
  36. //! get trait from a mode; this function is thread safe
  37. MGE_WIN_DECLSPEC_FUC static const ModeTrait& from_mode(Mode mode);
  38. };
  39. //! get trait of current mode
  40. const ModeTrait& mode_trait() const { return ModeTrait::from_mode(m_param.mode); }
  41. /**
  42. * \param[in] src input tensor
  43. * \param[out] dst output tensor
  44. *
  45. * src and dst should have the same shape;
  46. * layouts should be contiguous;
  47. * the underlying data pointer can point to the same memory region for
  48. * src and dst.
  *
  * NOTE(review): the class-level comment allows broadcastable inputs with
  * arbitrary layouts, which appears to conflict with the two requirements
  * above -- confirm which statement is current.
  49. */
  50. virtual void exec(_megdnn_in const TensorNDArray& src, _megdnn_tensor_out dst) = 0;
  51. //! deduce output shape (do not check whether arity matches)
  52. MGE_WIN_DECLSPEC_FUC static void deduce_shape(
  53. const TensorShapeArray& src, TensorShape& dst);
  //! deduce output format from the input formats
  54. MGE_WIN_DECLSPEC_FUC static void deduce_format(
  55. const TensorFormatArray& src, TensorFormat& dst);
  56. //! deduce output layout
  57. MGE_WIN_DECLSPEC_FUC void deduce_layout(
  58. const TensorLayoutArray& src, TensorLayout& dst);
  59. protected:
  60. //! throw exception if incorrect layout; broadcast input shape to
  61. //! output shape
  62. MGE_WIN_DECLSPEC_FUC void check_layout_and_broadcast(
  63. const TensorLayoutPtrArray& src, const TensorLayout& dst);
  64. private:
  //! dtype check helper (defined out of line; presumably validates \p dtype
  //! against the current mode's ModeTrait -- TODO confirm)
  65. void check_dtype(DType dtype);
  66. };
  67. using Elemwise = ElemwiseForward;
  68. /*!
  69. * \brief compute ``x**a`` where ``a`` is a constant from the Param
  70. *
  71. * This opr is usually not directly accessible by the end user; it is created
  72. * by the mgb optimizer, aiming to work around numerical stability issues with pow.
  73. * For example ``powf(x, 2.f)`` with ``x < 0`` in fast math mode may return NaN.
  74. *
  75. * Like elemwise, this opr supports arbitrary strides. But it should only be
  76. * used with monotone strides. Input and output should have the same
  77. * float-category dtype.
  78. */
  79. class PowC : public OperatorBase {
  80. DEF_OPR_PARAM(PowC);
  81. DEF_OPR_IMPL(PowC, OperatorBase, 1, 1);
  82. public:
  //! compute \p dst from \p src; declared here and defined out of line
  //! (presumably validates the layouts and forwards to do_exec() -- TODO confirm)
  83. void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst);
  84. //! compatible API for mgb; workspace is not used
  85. void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace) {
  86. return exec(src, dst);
  87. }
  //! always reports zero bytes; both layout arguments are ignored
  88. size_t get_workspace_in_bytes(const TensorLayout&, const TensorLayout&) {
  89. // the impls should require no workspace; this can be later changed to a
  90. // virtual function if this situation changes
  91. return 0;
  92. }
  //! copy the dtype of \p src into \p dst, then call
  //! dst.init_contiguous_stride(src)
  93. void deduce_layout(const TensorLayout& src, TensorLayout& dst) {
  94. dst.dtype = src.dtype;
  95. dst.init_contiguous_stride(src);
  96. }
  97. protected:
  98. /*!
  99. * Perform the computing where layouts have been verified.
  100. *
  101. * \p src can have arbitrary layout, and \p dst is contiguous. They have the
  102. * same shape and dtype.
  103. *
  104. * The implementation should not access param(). It should check \p exp_f
  105. * and \p exp_i for the exponent value. Exactly one of them would be
  106. * non-null.
  107. *
  108. * Note: \p exp_f and \p exp_i must be dereferenced before dispatching any
  109. * kernel. They are allocated on the caller's stack.
  110. */
  111. virtual void do_exec(
  112. _megdnn_tensor_in src, _megdnn_tensor_out dst, const float* exp_f,
  113. const int* exp_i) = 0;
  114. };
  115. /*!
  116. * \brief modify a tensor in place by adding another tensor to it
  117. *
  118. * dst and delta can have arbitrary layout but must have the same shape.
  119. */
  120. class AddUpdateForward : public OperatorBase {
  121. DEF_OPR_PARAM(AddUpdate);
  122. DEF_OPR_IMPL(AddUpdateForward, OperatorBase, -1, 1);
  123. public:
  //! \param[in,out] dst tensor that is updated in place
  //! \param[in] delta tensor that is added to \p dst
  124. virtual void exec(_megdnn_tensor_inout dst, _megdnn_tensor_in delta) = 0;
  125. protected:
  //! layout check helper for exec() (defined out of line)
  126. void check_exec(const TensorLayout& dst, const TensorLayout& delta);
  127. };
  128. using AddUpdate = AddUpdateForward;
  //! reduction operator; per the exec() contract below, dst keeps the shape of
  //! src except that the param().axis-th dimension has size one
  129. class ReduceForward : public OperatorBase {
  130. DEF_OPR_PARAM(Reduce);
  131. DEF_OPR_IMPL(ReduceForward, OperatorBase, 1, 1);
  132. public:
  133. using Mode = Param::Mode;
  134. using DataType = Param::DataType;
  135. /**
  136. * \param[in] src input tensor
  137. * \param[out] dst output tensor
  138. *
  139. * src and dst should be contiguous.
  140. * src and dst should be of the same shape for all dimensions except
  141. * param().axis.
  142. * the param().axis-th dimension shape for dst should be one.
  143. */
  144. virtual void exec(
  145. _megdnn_tensor_in src, _megdnn_tensor_out dst,
  146. _megdnn_workspace workspace) = 0;
  //! deduce the layout of \p dst from \p src
  147. MGE_WIN_DECLSPEC_FUC void deduce_layout(const TensorLayout& src, TensorLayout& dst);
  //! workspace size, in bytes, that exec() needs for the given layouts
  148. virtual size_t get_workspace_in_bytes(
  149. const TensorLayout& src, const TensorLayout& dst) = 0;
  150. protected:
  //! check helper for exec(); \p workspace_in_bytes is the size of the
  //! workspace supplied by the caller
  151. void check_exec(
  152. const TensorLayout& src, const TensorLayout& dst,
  153. size_t workspace_in_bytes);
  154. };
  155. using Reduce = ReduceForward;
  //! common base of the Correlation forward/backward operators: carries the
  //! Correlation param and the forward layout helpers
  156. class CorrelationBase : public OperatorBase {
  157. DEF_OPR_IMPL_CTOR(CorrelationBase, OperatorBase);
  158. DEF_OPR_PARAM(Correlation);
  159. protected:
  //! deduce the forward output layout \p dst from the two input layouts
  160. void deduce_layout_fwd(
  161. const TensorLayout& data1, const TensorLayout& data2, TensorLayout& dst);
  //! check the forward input/output layouts
  162. void check_layout_fwd(
  163. const TensorLayout& data1, const TensorLayout& data2,
  164. const TensorLayout& dst);
  165. };
  //! forward correlation operator taking two (n, c, ih, iw) inputs
  166. class CorrelationForward : public CorrelationBase {
  167. DEF_OPR_IMPL(CorrelationForward, CorrelationBase, 2, 1);
  168. public:
  169. /**
  170. * \param[in] data1 (n, c, ih, iw)
  171. * \param[in] data2 (n, c, ih, iw)
  172. * \param[out] dst (n, q, oh, ow), q is the number of neighborhood
  173. * */
  174. virtual void exec(
  175. _megdnn_tensor_in data1, _megdnn_tensor_in data2, _megdnn_tensor_out dst,
  176. _megdnn_workspace workspace) = 0;
  //! deduce the layout of \p dst from the two input layouts
  177. void deduce_layout(
  178. const TensorLayout& data1, const TensorLayout& data2, TensorLayout& dst);
  //! workspace size, in bytes, that exec() needs for the given layouts
  179. virtual size_t get_workspace_in_bytes(
  180. const TensorLayout& data1, const TensorLayout& data2,
  181. const TensorLayout& dst) = 0;
  182. protected:
  //! check helper for exec(); \p workspace_in_bytes is the size of the
  //! workspace supplied by the caller
  183. void check_exec(
  184. const TensorLayout& data1, const TensorLayout& data2,
  185. const TensorLayout& dst, size_t workspace_in_bytes);
  186. };
  187. using Correlation = CorrelationForward;
  //! backward operator of CorrelationForward that computes the gradient
  //! wrt. data1
  188. class CorrelationBackwardData1 : public CorrelationBase {
  189. DEF_OPR_IMPL(CorrelationBackwardData1, CorrelationBase, 3, 1);
  190. public:
  191. /**
  192. * \param[in] diff the backpropagated gradient wrt. dst
  193. * \param[in] data1 the `data1' parameter in CorrelationForward::exec
  194. * \param[in] data2 the `data2' parameter in CorrelationForward::exec
  195. * \param[out] grad1 the backpropagated gradient wrt. data1
  196. */
  197. virtual void exec(
  198. _megdnn_tensor_in diff, _megdnn_tensor_in data1, _megdnn_tensor_in data2,
  199. _megdnn_tensor_out grad1, _megdnn_workspace workspace) = 0;
  //! deduce the output layout.
  //! NOTE(review): the parameters are named diff1/dst here but diff/grad1 in
  //! the other members of this class -- presumably the same tensors; confirm.
  200. void deduce_layout(
  201. const TensorLayout& diff1, const TensorLayout& data1,
  202. const TensorLayout& data2, TensorLayout& dst);
  //! workspace size, in bytes, that exec() needs for the given layouts
  203. virtual size_t get_workspace_in_bytes(
  204. const TensorLayout& diff, const TensorLayout& data1,
  205. const TensorLayout& data2, const TensorLayout& grad1) = 0;
  206. protected:
  //! check helper for exec(); \p workspace_in_bytes is the size of the
  //! workspace supplied by the caller
  207. void check_exec(
  208. const TensorLayout& diff, const TensorLayout& data1,
  209. const TensorLayout& data2, const TensorLayout& grad1,
  210. size_t workspace_in_bytes);
  211. };
  //! backward operator of CorrelationForward that computes the gradient
  //! wrt. data2
  212. class CorrelationBackwardData2 : public CorrelationBase {
  213. DEF_OPR_IMPL(CorrelationBackwardData2, CorrelationBase, 3, 1);
  214. public:
  215. /**
  216. * \param[in] diff the backpropagated gradient wrt. dst
  217. * \param[in] data1 the `data1' parameter in CorrelationForward::exec
  218. * \param[in] data2 the `data2' parameter in CorrelationForward::exec
  219. * \param[out] grad2 the backpropagated gradient wrt. data2
  220. */
  221. virtual void exec(
  222. _megdnn_tensor_in diff, _megdnn_tensor_in data1, _megdnn_tensor_in data2,
  223. _megdnn_tensor_out grad2, _megdnn_workspace workspace) = 0;
  //! deduce the output layout.
  //! NOTE(review): the parameters are named diff1/dst here but diff/grad2 in
  //! the other members of this class -- presumably the same tensors; confirm.
  224. void deduce_layout(
  225. const TensorLayout& diff1, const TensorLayout& data1,
  226. const TensorLayout& data2, TensorLayout& dst);
  //! workspace size, in bytes, that exec() needs for the given layouts
  227. virtual size_t get_workspace_in_bytes(
  228. const TensorLayout& diff, const TensorLayout& data1,
  229. const TensorLayout& data2, const TensorLayout& grad2) = 0;
  230. protected:
  //! check helper for exec(); \p workspace_in_bytes is the size of the
  //! workspace supplied by the caller
  231. void check_exec(
  232. const TensorLayout& diff, const TensorLayout& data1,
  233. const TensorLayout& data2, const TensorLayout& grad2,
  234. size_t workspace_in_bytes);
  235. };
  //! cumulative sum operator; see exec() for the exclusive/reverse semantics
  236. class CumsumForward : public OperatorBase {
  237. DEF_OPR_PARAM(Cumsum);
  238. DEF_OPR_IMPL(CumsumForward, OperatorBase, 1, 1);
  239. public:
  240. /**
  241. * \param[in] src input tensor
  242. * \param[out] dst output tensor
  243. *
  244. * src and dst should be contiguous.
  245. * src and dst should have the same shape.
  246. *
  247. * The exclusive flag specifies whether the current element is excluded
  248. * from the sum (see the examples below).
  249. *
  250. * The reverse flag specifies whether cumsum is forward (
  251. * from 0 to n) or backward (from n downto 0).
  252. *
  253. * Example:
  254. * exclusive && reverse:
  255. * dst_i = src_{i+1} + src_{i+2} + ... + src_{n-1}
  256. * exclusive && !reverse
  257. * dst_i = src_0 + src_1 + ... + src_{i-1}
  258. * !exclusive && reverse:
  259. * dst_i = src_i + src_{i+1} + ... + src_{n-1}
  260. * !exclusive && !reverse:
  261. * dst_i = src_0 + src_1 + ... + src_i
  262. */
  263. virtual void exec(
  264. _megdnn_tensor_in src, _megdnn_tensor_out dst,
  265. _megdnn_workspace workspace) = 0;
  //! deduce the layout of \p dst from \p src
  266. void deduce_layout(const TensorLayout& src, TensorLayout& dst);
  //! workspace size, in bytes, that exec() needs for the given layouts
  267. virtual size_t get_workspace_in_bytes(
  268. const TensorLayout& src, const TensorLayout& dst) = 0;
  269. protected:
  //! check helper for exec(); \p workspace_in_bytes is the size of the
  //! workspace supplied by the caller
  270. void check_exec(
  271. const TensorLayout& src, const TensorLayout& dst,
  272. size_t workspace_in_bytes);
  273. };
  274. using Cumsum = CumsumForward;
  275. // mxx can be max or min
  //! common base of ArgmaxForward and ArgminForward; carries the Axis param
  276. class ArgmxxBase : public OperatorBase {
  277. DEF_OPR_IMPL_CTOR(ArgmxxBase, OperatorBase);
  278. DEF_OPR_PARAM(Axis);
  279. protected:
  //! check the src/dst layouts shared by the argmax and argmin operators
  280. void check_layout_fwd(const TensorLayout& src, const TensorLayout& dst);
  281. };
  //! operator producing the argmax indices of src along param().axis
  282. class ArgmaxForward : public ArgmxxBase {
  283. DEF_OPR_IMPL(ArgmaxForward, ArgmxxBase, 1, 1);
  284. public:
  285. /**
  286. * \param[in] src input tensor
  287. * \param[out] dst output tensor containing the argmax indices
  288. *
  289. * src and dst should be contiguous.
  290. * src and dst should be of the same shape for all dimensions except
  291. * param().axis.
  292. * the param().axis-th dimension shape for dst should be one.
  293. */
  294. virtual void exec(
  295. _megdnn_tensor_in src, _megdnn_tensor_out dst,
  296. _megdnn_workspace workspace) = 0;
  //! deduce the layout of \p dst from \p src
  297. void deduce_layout(const TensorLayout& src, TensorLayout& dst);
  //! workspace size, in bytes, that exec() needs for the given layouts
  298. virtual size_t get_workspace_in_bytes(
  299. const TensorLayout& src, const TensorLayout& dst) = 0;
  300. protected:
  //! check helper for exec(); \p workspace_in_bytes is the size of the
  //! workspace supplied by the caller
  301. void check_exec(
  302. const TensorLayout& src, const TensorLayout& dst,
  303. size_t workspace_in_bytes);
  304. };
  305. using Argmax = ArgmaxForward;
  //! operator producing the argmin indices of src along param().axis
  306. class ArgminForward : public ArgmxxBase {
  307. DEF_OPR_IMPL(ArgminForward, ArgmxxBase, 1, 1);
  308. public:
  309. /**
  310. * \param[in] src input tensor
  311. * \param[out] dst output tensor containing the argmin indices
  312. *
  313. * src and dst should be contiguous.
  314. * src and dst should be of the same shape for all dimensions except
  315. * param().axis.
  316. * the param().axis-th dimension shape for dst should be one.
  317. */
  318. virtual void exec(
  319. _megdnn_tensor_in src, _megdnn_tensor_out dst,
  320. _megdnn_workspace workspace) = 0;
  //! deduce the layout of \p dst from \p src
  321. void deduce_layout(const TensorLayout& src, TensorLayout& dst);
  //! workspace size, in bytes, that exec() needs for the given layouts
  322. virtual size_t get_workspace_in_bytes(
  323. const TensorLayout& src, const TensorLayout& dst) = 0;
  324. protected:
  //! check helper for exec(); \p workspace_in_bytes is the size of the
  //! workspace supplied by the caller
  325. void check_exec(
  326. const TensorLayout& src, const TensorLayout& dst,
  327. size_t workspace_in_bytes);
  328. };
  329. using Argmin = ArgminForward;
  330. /*!
  331. * \brief take values from input according to given condition
  332. *
  333. * Output two tensors:
  334. * 1. values copied from *data*, with same dtype as *data*
  335. * 2. selected indices with dtype int32; note that it is 1-dimensional and
  336. * based on the flattened input.
  337. *
  338. * Require data and mask to have the same shape and both be contiguous.
  339. */
  340. class CondTake : public OperatorBase {
  341. DEF_OPR_IMPL(CondTake, OperatorBase, 2, 2);
  342. DEF_OPR_PARAM(CondTake);
  343. public:
  //! the two output tensors (values, then indices, per the class comment)
  344. using Output = std::array<TensorND, 2>;
  //! dtypes of the two outputs, in the same order as Output
  345. using OutputDType = std::array<DType, 2>;
  //! infer the dtypes of both outputs from the dtypes of data and mask
  346. OutputDType infer_dtype(DType data, DType mask);
  //! workspace size, in bytes, that exec() needs for the given data layout
  347. virtual size_t get_workspace_in_bytes(const TensorLayout& data) = 0;
  //! run the operator and return both outputs; the outputs are returned by
  //! value, and \p malloc_policy is presumably used to allocate them because
  //! their size is only known at run time -- TODO confirm
  348. virtual Output exec(
  349. _megdnn_tensor_in data, _megdnn_tensor_in mask, _megdnn_workspace workspace,
  350. DynOutMallocPolicyCall malloc_policy) = 0;
  351. protected:
  352. //! check input layouts and get flattened size
  353. size_t check_exec_get_size(
  354. const TensorLayout& data, const TensorLayout& mask,
  355. size_t workspace_in_bytes);
  356. };
  //! 2-D transpose operator: an (m, n) input becomes an (n, m) output
  357. class TransposeForward : public OperatorBase {
  358. DEF_OPR_IMPL(TransposeForward, OperatorBase, 1, 1);
  359. DEF_OPR_PARAM(Empty);
  360. public:
  361. /**
  362. * \param[in] src (m, n) stride[0] >= n && stride[1] == 1
  363. * \param[out] dst (n, m) stride[0] >= m && stride[1] == 1
  364. */
  365. virtual void exec(
  366. _megdnn_tensor_in src, _megdnn_tensor_out dst,
  367. _megdnn_workspace workspace) = 0;
  //! deduce the layout of \p dst from \p src
  368. void deduce_layout(const TensorLayout& src, TensorLayout& dst);
  //! workspace size, in bytes, that exec() needs for the given layouts
  369. virtual size_t get_workspace_in_bytes(
  370. const TensorLayout& src, const TensorLayout& dst) = 0;
  371. protected:
  //! check helper for exec(); \p workspace_in_bytes is the size of the
  //! workspace supplied by the caller
  372. void check_exec(
  373. const TensorLayout& src, const TensorLayout& dst,
  374. size_t workspace_in_bytes);
  375. };
  376. using Transpose = TransposeForward;
  377. /**
  378. * Change a tensor to another layout that has the same dtype and total number of
  379. * elements, and non-overlapping stride.
  380. *
  381. * ON CPU:
  382. * This operator is optimized for some cases (e.g. both dst and last dim of src
  383. * are contiguous)
  384. *
  385. * ON CUDA:
  386. * The more contiguous the input/output layouts, the higher the performance. There
  387. * is also special optimization for the broadcast case.
  388. */
  389. class RelayoutForward : public OperatorBase {
  390. DEF_OPR_IMPL(RelayoutForward, OperatorBase, 1, 1);
  391. DEF_OPR_PARAM(Empty);
  392. public:
  393. /*!
  394. * \brief execute relayout opr
  395. *
  396. * This operator should be placed on the same computing device of *dst*.
  397. *
  398. * \param src_handle handle of input tensor; for CUDA d2d copy, the
  399. * src handle can be on a different GPU for copy tensor with
  400. * non-contig dims <= 2
  401. */
  402. virtual void exec(
  403. _megdnn_tensor_in src, _megdnn_tensor_out dst,
  404. Handle* src_handle = nullptr) = 0;
  405. protected:
  406. //! check layout and collapse contiguous
  //! (both layouts are taken by non-const reference and may be modified)
  407. void check_layout_and_canonize(TensorLayout& src, TensorLayout& dst);
  408. };
  409. using Relayout = RelayoutForward;
  410. /**
  411. * \brief Base class for Concat and Split operators
  412. */
  413. class ConcatSplitBase : public OperatorBase {
  414. public:
  //! the only parameter is the axis to concatenate/split along
  415. using Param = param::Axis;
  416. ConcatSplitBase(Handle* handle);
  //! read-only access to the stored param
  417. const Param& param() const { return m_param; }
  //! mutable access to the stored param
  418. Param& param() { return m_param; }
  419. protected:
  //! layout check shared by the derived operators
  420. void check_layout_common(const TensorLayoutArray& srcs, const TensorLayout& dst);
  421. Param m_param;
  422. /**
  423. * \brief a helper function
  424. *
  425. * A = shape[0] * shape[1] * ... * shape[axis-1]
  426. * B = {srcs[0].shape[axis], srcs[1].shape[axis], ...}
  427. * C = shape[axis+1] * shape[axis+2] * ... * shape[ndim-1]
  *
  * \p B is written through a raw pointer; presumably the caller must
  * provide room for one entry per element of \p srcs -- TODO confirm.
  428. */
  429. void get_ABC(const TensorShapeArray& srcs, size_t& A, size_t* B, size_t& C);
  //! callable mapping a TensorND to its TensorLayout
  430. thin_function<TensorLayout(const TensorND& tensor)> m_get_layout;
  //! callable mapping a TensorLayout to its TensorShape
  431. thin_function<TensorShape(const TensorLayout& layout)> m_get_shape;
  432. };
  //! concatenate several tensors along param().axis into one output tensor
  433. class ConcatForward : public ConcatSplitBase {
  434. DEF_OPR_IMPL(ConcatForward, ConcatSplitBase, 1, 1);
  435. public:
  436. /**
  437. * \param[in] srcs a vector containing all inputs to be concatenated
  438. * \param[out] dst the output tensor.
  439. *
  440. * All tensors in srcs and dst should be contiguous.
  441. * All tensors should have the same shape for all axes except
  442. * param().axis.
  443. * For the param().axis-th axis, the axis shape for dst should be the
  444. * sum of corresponding axis shapes for all srcs.
  445. */
  446. virtual void exec(
  447. _megdnn_in const TensorNDArray& srcs, _megdnn_tensor_out dst,
  448. _megdnn_workspace workspace) = 0;
  //! deduce the layout of \p dst from the input layouts
  449. void deduce_layout(const TensorLayoutArray& srcs, TensorLayout& dst);
  //! workspace size, in bytes, that exec() needs for the given layouts
  450. virtual size_t get_workspace_in_bytes(
  451. const TensorLayoutArray& srcs, const TensorLayout& dst) = 0;
  452. protected:
  //! check helper for exec(); \p workspace_in_bytes is the size of the
  //! workspace supplied by the caller
  453. void check_exec(
  454. const TensorLayoutArray& srcs, const TensorLayout& dst,
  455. size_t workspace_in_bytes);
  456. };
  457. using Concat = ConcatForward;
  //! split one tensor along param().axis into several output tensors
  458. class SplitForward : public ConcatSplitBase {
  459. DEF_OPR_IMPL(SplitForward, ConcatSplitBase, 1, 1);
  460. public:
  461. /**
  462. * \param[in] src input tensor
  463. * \param[out] dsts a vector containing all the split results
  464. *
  465. * All tensors in src and dsts should be contiguous.
  466. * All tensors should have the same shape for all axes except
  467. * param().axis.
  468. * For the param().axis-th axis, the axis shape for src should be the
  469. * sum of corresponding axis shapes for all dsts.
  470. */
  471. virtual void exec(
  472. _megdnn_tensor_in src, const TensorNDArray& dsts,
  473. _megdnn_workspace workspace) = 0;
  //! workspace size, in bytes, that exec() needs for the given layouts
  474. virtual size_t get_workspace_in_bytes(
  475. const TensorLayout& src, const TensorLayoutArray& dsts) = 0;
  476. protected:
  //! check helper for exec(); \p workspace_in_bytes is the size of the
  //! workspace supplied by the caller
  477. void check_exec(
  478. const TensorLayout& src, const TensorLayoutArray& dsts,
  479. size_t workspace_in_bytes);
  480. };
  481. using Split = SplitForward;
  482. /**
  483. * \brief Base class for ParamPackConcat and ParamPackSplit Operators.
  484. *
  485. * ParamPack oprs act like Concat and Split, but they are also optimized for a
  486. * large number of inputs and can handle alignment requirements. Axis is also
  487. * not supported.
  488. *
  489. * The offsets can be generated by gen_offsets().
  490. */
  491. class ParamPackConcatSplitBase : public OperatorBase {
  492. protected:
  //! layout check shared by the derived operators: the concatenated tensor,
  //! the offsets tensor and the parts
  493. void check_exec(
  494. const TensorLayout& concated, const TensorLayout& offsets,
  495. const TensorLayout& parts);
  496. public:
  //! these operators take no parameters
  497. using Param = megdnn::param::Empty;
  498. ParamPackConcatSplitBase(Handle* handle) : OperatorBase(handle) {}
  499. //! generate offsets to be used with ParamPackConcat and ParamPackSplit
  500. MGE_WIN_DECLSPEC_FUC static std::vector<dt_int32> gen_offsets(
  501. const TensorShapeArray& shapes, size_t alignment, size_t dtype_size);
  502. };
  503. /**
  504. * \brief ParamPackConcat, used for calculating gradient of ParamPackSplit
  505. * Combine multiple gradient tensors into a single large tensor, use copy
  506. * strategy due to AddUpdate or other dynamic situation.
  507. */
  508. class ParamPackConcat : public ParamPackConcatSplitBase {
  509. DEF_OPR_IMPL(ParamPackConcat, ParamPackConcatSplitBase, 2, 1);
  510. public:
  511. /**
  512. * \param[in] srcs: TensorND on cpu. srcs[i] corresponding to the
  513. * address of i-th Tensor.
  514. * \param[in] offsets: with size `2 * srcs.shape[0]`.
  515. * offsets[i * 2] and offsets[i * 2 + 1] means
  516. * the begin and the end of srcs[i]'s offsets in dst
  517. * \param[out] dst: output TensorND, live on cpu or gpu
  518. */
  519. virtual void exec(
  520. _megdnn_tensor_in srcs, _megdnn_tensor_in offsets, _megdnn_tensor_out dst,
  521. _megdnn_workspace workspace) = 0;
  //! workspace size, in bytes, that exec() needs; note that this overload
  //! takes shapes rather than layouts
  522. virtual size_t get_workspace_in_bytes(
  523. const TensorShapeArray& srcs, const TensorShape& offsets,
  524. const TensorShape& dst) = 0;
  525. };
  526. /**
  527. * \brief base class for Tile and Repeat
  528. */
  529. class TileRepeatBase : public OperatorBase {
  530. public:
  531. TileRepeatBase(Handle* handle) : OperatorBase(handle) {}
  //! operator parameter: per-dimension multipliers (TileForward and
  //! RepeatForward document dst.shape[i] as src.shape[i] * times[i])
  532. struct Param {
  533. TensorShape times;
  534. };
  //! mutable access to the stored param
  535. Param& param() { return m_param; }
  //! read-only access to the stored param
  536. const Param& param() const { return m_param; }
  537. protected:
  //! check the forward src/dst layouts
  538. void check_layout_fwd(const TensorLayout& src, const TensorLayout& dst);
  //! deduce the forward layout of \p dst from \p src
  539. void deduce_layout_fwd(const TensorLayout& src, TensorLayout& dst);
  540. /**
  541. * Assuming src/dst/times are already simplified on entrance.
  542. */
  543. size_t get_workspace_in_bytes_fwd(
  544. const TensorShape& src, const TensorShape& dst, const TensorShape& times,
  545. DType dtype);
  546. Param m_param;
  547. };
  //! common base of TileForward and TileBackward
  548. class TileBase : public TileRepeatBase {
  549. public:
  550. TileBase(Handle* handle) : TileRepeatBase(handle) {}
  551. protected:
  //! write simplified versions of src/dst/times into src2/dst2/times2
  //! (presumably the "simplified" form expected by the shape-based
  //! TileRepeatBase::get_workspace_in_bytes_fwd -- TODO confirm)
  552. void simplify_shape(
  553. const TensorShape& src, const TensorShape& dst, const TensorShape& times,
  554. TensorShape& src2, TensorShape& dst2, TensorShape& times2);
  555. /**
  556. * This is a helper function that would facilitate other backends'
  557. * implementation.
  558. */
  559. size_t get_workspace_in_bytes_fwd(const TensorLayout& src, const TensorLayout& dst);
  560. };
  //! forward Tile operator (numpy.tile-like; see exec())
  561. class TileForward : public TileBase {
  562. DEF_OPR_IMPL(TileForward, TileBase, 1, 1);
  563. public:
  564. /**
  565. * \brief Tile src times to get dst.
  566. * \param[in] src input tensor
  567. * \param[out] dst output tensor
  568. * \param[out] workspace temporary workspace
  569. *
  570. * src and dst must be contiguous.
  571. * dst.shape should be {src.shape[0]*param().times[0],
  572. * src.shape[1]*param().times[1], ...}
  573. *
  574. * \see http://docs.scipy.org/doc/numpy/reference/generated/numpy.tile.html
  575. *
  576. * Difference between Tile and Repeat:
  577. * Tiling `abc' twice yields `abcabc', whereas repeating `abc' twice
  578. * yields `aabbcc'.
  579. */
  580. virtual void exec(
  581. _megdnn_tensor_in src, _megdnn_tensor_out dst,
  582. _megdnn_workspace workspace) = 0;
  //! deduce the layout of \p dst from \p src
  583. void deduce_layout(const TensorLayout& src, TensorLayout& dst);
  //! workspace size, in bytes, that exec() needs for the given layouts
  584. virtual size_t get_workspace_in_bytes(
  585. const TensorLayout& src, const TensorLayout& dst) = 0;
  586. protected:
  //! check helper for exec(); \p workspace_in_bytes is the size of the
  //! workspace supplied by the caller
  587. void check_exec(
  588. const TensorLayout& src, const TensorLayout& dst,
  589. size_t workspace_in_bytes);
  590. };
  591. using Tile = TileForward;
  //! backward operator of TileForward: computes grad (wrt. src) from diff
  //! (wrt. dst)
  592. class TileBackward : public TileBase {
  593. DEF_OPR_IMPL(TileBackward, TileBase, 1, 1);
  594. public:
  595. /**
  596. * \param[in] diff the backpropagated gradient wrt. dst
  597. * \param[out] grad the backpropagated gradient wrt. src
  598. * \param[out] workspace temporary workspace
  599. */
  600. virtual void exec(
  601. _megdnn_tensor_in diff, _megdnn_tensor_out grad,
  602. _megdnn_workspace workspace) = 0;
  //! workspace size, in bytes, that exec() needs for the given layouts
  603. virtual size_t get_workspace_in_bytes(
  604. const TensorLayout& diff, const TensorLayout& grad) = 0;
  605. protected:
  //! check helper for exec(); \p workspace_in_bytes is the size of the
  //! workspace supplied by the caller
  606. void check_exec(
  607. const TensorLayout& diff, const TensorLayout& grad,
  608. size_t workspace_in_bytes);
  609. };
  //! common base of RepeatForward and RepeatBackward
  610. class RepeatBase : public TileRepeatBase {
  611. public:
  612. RepeatBase(Handle* handle) : TileRepeatBase(handle) {}
  613. protected:
  //! write simplified versions of src/dst/times into src2/dst2/times2
  //! (presumably the "simplified" form expected by the shape-based
  //! TileRepeatBase::get_workspace_in_bytes_fwd -- TODO confirm)
  614. void simplify_shape(
  615. const TensorShape& src, const TensorShape& dst, const TensorShape& times,
  616. TensorShape& src2, TensorShape& dst2, TensorShape& times2);
  617. /**
  618. * This is a helper function that would facilitate other backends'
  619. * implementation.
  620. */
  621. size_t get_workspace_in_bytes_fwd(const TensorLayout& src, const TensorLayout& dst);
  622. };
  //! forward Repeat operator (numpy.repeat-like; see exec())
  623. class RepeatForward : public RepeatBase {
  624. DEF_OPR_IMPL(RepeatForward, RepeatBase, 1, 1);
  625. public:
  626. /**
  627. * \brief Repeat src times to get dst.
  628. * \param[in] src input tensor
  629. * \param[out] dst output tensor
  630. * \param[out] workspace temporary workspace
  631. *
  632. * src and dst must be contiguous.
  633. * dst.shape should be {src.shape[0]*param().times[0],
  634. * src.shape[1]*param().times[1], ...}
  635. *
  636. * \see http://docs.scipy.org/doc/numpy/reference/generated/numpy.repeat.html
  637. * \see TileForward
  638. */
  639. virtual void exec(
  640. _megdnn_tensor_in src, _megdnn_tensor_out dst,
  641. _megdnn_workspace workspace) = 0;
  //! deduce the layout of \p dst from \p src
  642. void deduce_layout(const TensorLayout& src, TensorLayout& dst);
  //! workspace size, in bytes, that exec() needs for the given layouts
  643. virtual size_t get_workspace_in_bytes(
  644. const TensorLayout& src, const TensorLayout& dst) = 0;
  645. protected:
  //! check helper for exec(); \p workspace_in_bytes is the size of the
  //! workspace supplied by the caller
  646. void check_exec(
  647. const TensorLayout& src, const TensorLayout& dst,
  648. size_t workspace_in_bytes);
  649. };
  650. using Repeat = RepeatForward;
  //! backward operator of RepeatForward: computes grad (wrt. src) from diff
  //! (wrt. dst)
  651. class RepeatBackward : public RepeatBase {
  652. DEF_OPR_IMPL(RepeatBackward, RepeatBase, 1, 1);
  653. public:
  654. /**
  655. * \param[in] diff the backpropagated gradient wrt. dst
  656. * \param[out] grad the backpropagated gradient wrt. src
  657. * \param[out] workspace temporary workspace
  658. */
  659. virtual void exec(
  660. _megdnn_tensor_in diff, _megdnn_tensor_out grad,
  661. _megdnn_workspace workspace) = 0;
  //! workspace size, in bytes, that exec() needs for the given layouts
  662. virtual size_t get_workspace_in_bytes(
  663. const TensorLayout& diff, const TensorLayout& grad) = 0;
  664. protected:
  //! check helper for exec(); \p workspace_in_bytes is the size of the
  //! workspace supplied by the caller
  665. void check_exec(
  666. const TensorLayout& diff, const TensorLayout& grad,
  667. size_t workspace_in_bytes);
  668. };
  669. class ArgsortForward : public OperatorBase {
  670. DEF_OPR_IMPL(ArgsortForward, OperatorBase, 1, 2);
  671. DEF_OPR_PARAM(Argsort);
  672. public:
  673. using Order = Param::Order;
  674. /**
  675. * \param[in] src (m, n)
  676. * \param[out] dst (m, n)
  677. * \param[out] indices (m, n)
  678. *
  679. * src, dst and indices should be contiguous.
  680. * Performing m independent sorting on m arrays of length n.
  681. * Sorting arrays and storing the resulting array in `dst',
  682. * and the corresponding indices in `indices'.
  683. *
  684. * Indices range from 0 to n-1.
  685. *
  686. * Note that indices is a TensorND of type int.
  687. */
  688. virtual void exec(
  689. _megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_tensor_out indices,
  690. _megdnn_workspace workspace) = 0;
  691. void deduce_layout(
  692. const TensorLayout& src, TensorLayout& dst, TensorLayout& indices);
  693. virtual size_t get_workspace_in_bytes(
  694. const TensorLayout& src, const TensorLayout& dst,
  695. const TensorLayout& indices) = 0;
  696. protected:
  697. void check_exec(
  698. const TensorLayout& src, const TensorLayout& dst,
  699. const TensorLayout& indices, size_t workspace_in_bytes);
  700. };
  701. using Argsort = ArgsortForward;
  702. /*!
  703. * \brief backward opr for Argsort
  704. *
  705. * Note: the name is kept for backward compatibility. This opr is actually a
  706. * batched value setter. It is used for gradient computing of Argsort and TopK.
  707. */
  708. class ArgsortBackward : public OperatorBase {
  709. DEF_OPR_IMPL(ArgsortBackward, OperatorBase, 2, 1);
  710. DEF_OPR_PARAM(Empty);
  711. public:
  712. /**
  713. * \param[in] diff (m, k) the backpropagated gradient wrt. dst
  714. * \param[in] indices (m, k) the `indices' parameter in
  715. * ArgsortForward::exec
  716. * \param[out] grad (m, n) the backpropagated gradient wrt. src
  717. *
  718. * Constraint: n >= k. Untouched values would be initialized as zero.
  719. */
  720. virtual void exec(
  721. _megdnn_tensor_in diff, _megdnn_tensor_in indices, _megdnn_tensor_out grad,
  722. _megdnn_workspace workspace) = 0;
  723. virtual size_t get_workspace_in_bytes(
  724. const TensorLayout& diff, const TensorLayout& indices,
  725. const TensorLayout& grad) = 0;
  726. protected:
  727. void check_exec(
  728. const TensorLayout& diff, const TensorLayout& indices,
  729. const TensorLayout& grad, size_t workspace_in_bytes);
  730. };
  731. class TopK : public OperatorBase {
  732. DEF_OPR_IMPL(TopK, OperatorBase, 1, 2);
  733. DEF_OPR_PARAM(TopK);
  734. protected:
  735. //! impl exec; inputs have been validated
  736. virtual void do_exec(
  737. int k, _megdnn_tensor_in data, _megdnn_tensor_out values, int32_t* indices,
  738. _megdnn_workspace workspace) = 0;
  739. public:
  740. /*!
  741. * \param[in] k if positive, compute the smallest top-k values; otherwise
  742. * compute the largest top-k values
  743. * \param[in] data (m, n) input data, where top-k is computed on the
  744. * second axis. The second dimension must be contiguous, and the first
  745. * dimension can have arbitrary stride.
  746. * \param[out] values (m, ) or (m, k) output values; its shape depends
  747. * on mode
  748. * \param[out] indices () or (m, ) or (m, k) output values; its shape
  749. * depends on mode
  750. */
  751. void exec(
  752. int k, _megdnn_tensor_in data, _megdnn_tensor_out values,
  753. _megdnn_tensor_out indices, _megdnn_workspace workspace);
  754. virtual size_t get_workspace_in_bytes(
  755. int k, const TensorLayout& data, const TensorLayout& values,
  756. const TensorLayout& indices) = 0;
  757. void deduce_layout(
  758. int k, const TensorLayout& data, TensorLayout& values,
  759. TensorLayout& indices);
  760. };
  761. /*!
  762. * \brief convert dtype of *src* to match dtype of *dst*; *src* may have
  763. * arbitrary layout and *dst* must be contiguous.
  764. */
  765. class TypeCvtForward : public OperatorBase {
  766. DEF_OPR_PARAM(Empty);
  767. DEF_OPR_IMPL(TypeCvtForward, OperatorBase, 1, 1);
  768. public:
  769. virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst) = 0;
  770. protected:
  771. void check_exec(const TensorLayout& src, const TensorLayout& dst);
  772. };
  773. using TypeCvt = TypeCvtForward;
  774. class IndexingRemapBase : public OperatorBase {
  775. public:
  776. using Param = param::IndexingRemap;
  777. IndexingRemapBase(Handle* handle) : OperatorBase(handle) {}
  778. Param& param() { return m_param; }
  779. const Param& param() const { return m_param; }
  780. protected:
  781. Param m_param;
  782. void check_layout_fwd(
  783. const TensorLayout& src, const TensorLayout& map, const TensorLayout& dst);
  784. };
  785. class IndexingRemapForward : public IndexingRemapBase {
  786. DEF_OPR_IMPL(IndexingRemapForward, IndexingRemapBase, 2, 1);
  787. public:
  788. /**
  789. * \param[in] src input tensor
  790. * \param[in] map input map
  791. * \param[out] dst output tensor
  792. *
  793. * Suppose:
  794. * the shape of src is \f$(s_0, s_1, ..., s_{m-1}\f$;
  795. * the shape of dst is \f$(d_0, d_1, ..., d_{n-1})\f$;
  796. * then:
  797. * the shape of map must be \f$(d_0, d_1, ..., d_{n-1}, m)\f$.
  798. *
  799. * The last dimension of map indicates the src indices for the
  800. * corresponding dst entry.
  801. *
  802. * src and dst can be non-contiguous in a non-overlapping manner.
  803. */
  804. virtual void exec(
  805. _megdnn_tensor_in src, _megdnn_tensor_in map, _megdnn_tensor_out dst,
  806. _megdnn_workspace workspace) = 0;
  807. void deduce_layout(
  808. const TensorLayout& src, const TensorLayout& map, TensorLayout& dst);
  809. virtual size_t get_workspace_in_bytes(
  810. const TensorLayout& src, const TensorLayout& map,
  811. const TensorLayout& dst) = 0;
  812. protected:
  813. void check_exec(
  814. const TensorLayout& src, const TensorLayout& map, const TensorLayout& dst,
  815. size_t workspace_in_bytes);
  816. };
  817. using IndexingRemap = IndexingRemapForward;
  818. // The using directives preserve backward compatibility.
  819. using TensorRemapForward = IndexingRemap;
  820. using TensorRemap = TensorRemapForward;
  821. class IndexingRemapBackward : public IndexingRemapBase {
  822. DEF_OPR_IMPL(IndexingRemapBackward, IndexingRemapBase, 2, 1);
  823. public:
  824. /**
  825. * \param[in] diff the backpropagated gradient wrt. dst
  826. * \param[in] map the `map' parameter in IndexingRemapForward::exec
  827. * \param[out] grad the backpropagated gradient wrt. src
  828. */
  829. virtual void exec(
  830. _megdnn_tensor_in diff, _megdnn_tensor_in map, _megdnn_tensor_out grad,
  831. _megdnn_workspace workspace) = 0;
  832. virtual size_t get_workspace_in_bytes(
  833. const TensorLayout& diff, const TensorLayout& map,
  834. const TensorLayout& grad) = 0;
  835. protected:
  836. void check_exec(
  837. const TensorLayout& diff, const TensorLayout& map, const TensorLayout& grad,
  838. size_t workspace_in_bytes);
  839. };
  840. // The using directives preserve backward compatibility.
  841. using TensorRemapBackward = IndexingRemapBackward;
  842. class Linspace : public OperatorBase {
  843. DEF_OPR_IMPL(Linspace, OperatorBase, 0, 1);
  844. DEF_OPR_PARAM(LinspaceFull);
  845. public:
  846. /**
  847. * \param[out] dst must be 1d.
  848. *
  849. * \see http://docs.scipy.org/doc/numpy/reference/generated/numpy.linspace.html
  850. */
  851. virtual void exec(_megdnn_tensor_out dst, _megdnn_workspace workspace) = 0;
  852. virtual size_t get_workspace_in_bytes(const TensorLayout& dst) = 0;
  853. protected:
  854. void check_exec(const TensorLayout& dst, size_t workspace_in_bytes);
  855. };
  856. class Eye : public OperatorBase {
  857. DEF_OPR_IMPL(Eye, OperatorBase, 0, 1);
  858. DEF_OPR_PARAM(Eye);
  859. public:
  860. /**
  861. * \see http://docs.scipy.org/doc/numpy/reference/generated/numpy.eye.html
  862. */
  863. virtual void exec(_megdnn_tensor_out dst, _megdnn_workspace workspace) = 0;
  864. virtual size_t get_workspace_in_bytes(const TensorLayout& dst) = 0;
  865. protected:
  866. void check_exec(const TensorLayout& dst, size_t workspace_in_bytes);
  867. };
  868. class Diag : public OperatorBase {
  869. DEF_OPR_IMPL(Diag, OperatorBase, 1, 1);
  870. DEF_OPR_PARAM(Diag);
  871. public:
  872. /**
  873. * \see http://docs.scipy.org/doc/numpy/reference/generated/numpy.diag.html
  874. */
  875. virtual void exec(
  876. _megdnn_tensor_in src, _megdnn_tensor_out dst,
  877. _megdnn_workspace workspace) = 0;
  878. void deduce_layout(const TensorLayout& src, TensorLayout& dst);
  879. virtual size_t get_workspace_in_bytes(
  880. const TensorLayout& src, const TensorLayout& dst) = 0;
  881. protected:
  882. void check_exec(
  883. const TensorLayout& src, const TensorLayout& dst,
  884. size_t workspace_in_bytes);
  885. };
  886. class IndexingOneHotBase : public OperatorBase {
  887. DEF_OPR_IMPL_CTOR(IndexingOneHotBase, OperatorBase);
  888. DEF_OPR_PARAM(Axis);
  889. protected:
  890. void deduce_layout_fwd(
  891. const TensorLayout& src, const TensorLayout& index, TensorLayout& dst);
  892. void check_layout_fwd(
  893. const TensorLayout& src, const TensorLayout& index,
  894. const TensorLayout& dst);
  895. };
  896. /*!
  897. * \brief Indexing for one-hot encoding
  898. *
  899. * Given src, axis and index,
  900. * for all valid (n-1)-dimensional subscript tuples i iterating through index:
  901. * dst[i[0], ..., i[axis-1], 0, i[axis], ..., i[n-2]] =
  902. * inp[i[0], ..., i[axis-1], index[i], i[axis], ..., i[n-2]]
  903. *
  904. * \param[in] src n-dimensional input data
  905. * \param[in] index (n-1)-dimensional index, must be int
  906. * \param[out] dst n-dimensional output data
  907. */
  908. class IndexingOneHotForward : public IndexingOneHotBase {
  909. DEF_OPR_IMPL(IndexingOneHotForward, IndexingOneHotBase, 2, 1);
  910. public:
  911. void deduce_layout(
  912. const TensorLayout& src, const TensorLayout& index, TensorLayout& dst) {
  913. deduce_layout_fwd(src, index, dst);
  914. }
  915. virtual void exec(
  916. _megdnn_tensor_in src, _megdnn_tensor_in index, _megdnn_tensor_out dst,
  917. _megdnn_workspace workspace) = 0;
  918. virtual size_t get_workspace_in_bytes(
  919. const TensorLayout& src, const TensorLayout& index,
  920. const TensorLayout& dst) = 0;
  921. protected:
  922. void check_exec(
  923. const TensorLayout& src, const TensorLayout& index, const TensorLayout& dst,
  924. size_t workspace_in_bytes);
  925. };
  926. using IndexingOneHot = IndexingOneHotForward;
  927. /*!
  928. * \brief set-subtensor corresponding to IndexingOneHotForward
  929. *
  930. * \param[in,out] data n-dimensional input and output data, whose sub part
  931. * corresponding to *index* would be replaced by *sub*
  932. * \param[in] index (n-1)-dimensional index, must be int
  933. * \param[in] sub n-dimensional sub tensor to be filled in *data*
  934. */
  935. class IndexingSetOneHotForward : public IndexingOneHotBase {
  936. DEF_OPR_IMPL(IndexingSetOneHotForward, IndexingOneHotBase, -1, 1);
  937. public:
  938. virtual void exec(
  939. _megdnn_tensor_inout data, _megdnn_tensor_in index, _megdnn_tensor_in sub,
  940. _megdnn_workspace workspace) = 0;
  941. virtual size_t get_workspace_in_bytes(
  942. const TensorLayout& data, const TensorLayout& index,
  943. const TensorLayout& sub) = 0;
  944. protected:
  945. void check_exec(
  946. const TensorLayout& data, const TensorLayout& index,
  947. const TensorLayout& sub, size_t workspace_in_bytes);
  948. };
  949. using IndexingSetOneHot = IndexingSetOneHotForward;
  950. /*!
  951. * \brief base class for indexing on multiple axes using vector indices
  952. *
  953. * Note that the indexing axes are required to be sorted in ascending order
  954. */
  955. class IndexingMultiAxisVecBase : public OperatorBase {
  956. DEF_OPR_IMPL_CTOR(IndexingMultiAxisVecBase, OperatorBase);
  957. DEF_OPR_PARAM(Empty);
  958. public:
  959. struct AxisIndexer {
  960. size_t axis;
  961. TensorND vec;
  962. };
  963. struct AxisIndexerLayoutOnly {
  964. size_t axis;
  965. TensorLayout layout;
  966. };
  967. using IndexDesc = std::vector<AxisIndexer>;
  968. using IndexDescLayoutOnly = std::vector<AxisIndexerLayoutOnly>;
  969. /*!
  970. * \brief convert IndexDesc to IndexDescLayoutOnly
  971. */
  972. static IndexDescLayoutOnly extract_index_layout(const IndexDesc& index);
  973. /*!
  974. * \brief get the axes on src that are not used in index
  975. * \param[out] out output buffer; suggested size is
  976. * TensorLayout::MAX_NDIM
  977. * \return number of elements written to *out*
  978. */
  979. static size_t get_nonindex_axes(
  980. size_t src_ndim, const IndexDesc& index, size_t* out);
  981. /*!
  982. * \brief get contiguous-collapsed layout for indexing on value
  983. * \param idx_axis indexer axis on value (i.e. ExecInfo::idx_axis)
  984. * \return a tensor layout and an axis to iterate over *value* and also
  985. * access *data*; stride of layout on that axis would be zero, and
  986. * strides on other axes correspond to the strides in *data*
  987. */
  988. static std::tuple<TensorLayout, size_t, TensorShape> get_value_iter_optimized_layout(
  989. const TensorLayout& data, const TensorLayout& value, const IndexDesc& index,
  990. size_t idx_axis);
  991. //! helper info for kernel implementation
  992. struct ExecInfo {
  993. //! axis in value used by indexer
  994. size_t idx_axis;
  995. ptrdiff_t value_stride;
  996. void* error_tracker;
  997. megcore::AsyncErrorInfo* error_info;
  998. };
  999. protected:
  1000. /*!
  1001. * \return axis on dst used by indexer (i.e. ExecInfo::idx_axis)
  1002. */
  1003. static size_t deduce_layout_fwd(
  1004. const TensorLayout& data, const IndexDescLayoutOnly& index,
  1005. TensorLayout& dst);
  1006. static ExecInfo check_exec_noworkspace(
  1007. const TensorLayout& data, const TensorLayout& value, const IndexDesc& index,
  1008. IndexDescLayoutOnly& index_layout);
  1009. };
  1010. /*!
  1011. * \brief compute indexing result, like numpy advanced indexing
  1012. *
  1013. * src can have arbitrary layout, but dst must be dim1-contig
  1014. */
  1015. class IndexingMultiAxisVec : public IndexingMultiAxisVecBase {
  1016. DEF_OPR_IMPL(IndexingMultiAxisVec, IndexingMultiAxisVecBase, 0, 1);
  1017. public:
  1018. virtual void exec(
  1019. _megdnn_tensor_in src, const IndexDesc& index, _megdnn_tensor_out dst,
  1020. _megdnn_workspace workspace) = 0;
  1021. /*!
  1022. * \brief get workspace size based on output shape and indexing axes
  1023. */
  1024. size_t get_workspace_in_bytes(
  1025. const TensorShape& dst, const size_t* axes, size_t nr_axes,
  1026. size_t idx_ndim);
  1027. static void deduce_layout(
  1028. const TensorLayout& data, const IndexDescLayoutOnly& index,
  1029. TensorLayout& dst) {
  1030. deduce_layout_fwd(data, index, dst);
  1031. }
  1032. protected:
  1033. virtual size_t get_workspace_in_bytes(size_t dst_idx_size) = 0;
  1034. ExecInfo check_exec(
  1035. const TensorLayout& src, const IndexDesc& index, const TensorLayout& dst,
  1036. size_t workspace_in_bytes);
  1037. };
  1038. /*!
  1039. * \brief base class for modifying data by given index
  1040. *
  1041. * data can have arbitrary layout, but value must be dim1-contig
  1042. */
  1043. class IndexingModifyMultiAxisVecBase : public IndexingMultiAxisVecBase {
  1044. DEF_OPR_IMPL_CTOR(IndexingModifyMultiAxisVecBase, IndexingMultiAxisVecBase);
  1045. public:
  1046. virtual void exec(
  1047. _megdnn_tensor_inout data, _megdnn_tensor_in value, const IndexDesc& index,
  1048. _megdnn_workspace workspace) = 0;
  1049. /*!
  1050. * \brief get workspace size based on shape of value input and indexing
  1051. * axes
  1052. */
  1053. size_t get_workspace_in_bytes(
  1054. const TensorShape& value, const size_t* axes, size_t nr_axes,
  1055. size_t idx_ndim);
  1056. protected:
  1057. ExecInfo check_exec(
  1058. const TensorLayout& data, const TensorLayout& value, const IndexDesc& index,
  1059. size_t workspace_in_bytes);
  1060. virtual size_t get_workspace_in_bytes(size_t value_idx_size) = 0;
  1061. };
  1062. //! set value to indexed locations; index values must be non-overlapping
  1063. class IndexingSetMultiAxisVec : public IndexingModifyMultiAxisVecBase {
  1064. DEF_OPR_IMPL(IndexingSetMultiAxisVec, IndexingModifyMultiAxisVecBase, 0, 0);
  1065. };
  1066. //! add value to indexed locations; index values must be non-overlapping
  1067. class IndexingIncrMultiAxisVec : public IndexingModifyMultiAxisVecBase {
  1068. DEF_OPR_IMPL(IndexingIncrMultiAxisVec, IndexingModifyMultiAxisVecBase, 0, 0);
  1069. };
  1070. class MeshBase : public OperatorBase {
  1071. DEF_OPR_PARAM(Empty);
  1072. DEF_OPR_IMPL_CTOR(MeshBase, OperatorBase);
  1073. public:
  1074. using AxisIndexer = IndexingMultiAxisVecBase::AxisIndexer;
  1075. using IndexDesc = IndexingMultiAxisVecBase::IndexDesc;
  1076. using AxisIndexerLayoutOnly = IndexingMultiAxisVecBase::AxisIndexerLayoutOnly;
  1077. using IndexDescLayoutOnly = IndexingMultiAxisVecBase::IndexDescLayoutOnly;
  1078. size_t get_workspace_in_bytes(const TensorShape&, const size_t*, size_t, size_t) {
  1079. return 0;
  1080. }
  1081. protected:
  1082. virtual void check_exec(
  1083. const TensorLayout& origin, const TensorLayout& indexed,
  1084. const IndexDesc& desc);
  1085. };
  1086. class NormalMeshBase : public MeshBase {
  1087. DEF_OPR_IMPL(NormalMeshBase, MeshBase, 0, 0);
  1088. protected:
  1089. virtual void check_exec(
  1090. const TensorLayout& origin, const TensorLayout& indexed,
  1091. const IndexDesc& desc) override final;
  1092. };
  1093. class NormalMeshModifyBase : public NormalMeshBase {
  1094. DEF_OPR_IMPL_CTOR(NormalMeshModifyBase, NormalMeshBase);
  1095. public:
  1096. virtual void exec(
  1097. _megdnn_tensor_inout data, _megdnn_tensor_in value, const IndexDesc& desc,
  1098. _megdnn_workspace workspace) = 0;
  1099. };
  1100. class BatchedMeshBase : public MeshBase {
  1101. DEF_OPR_IMPL_CTOR(BatchedMeshBase, MeshBase);
  1102. protected:
  1103. virtual void check_exec(
  1104. const TensorLayout& origin, const TensorLayout& indexed,
  1105. const IndexDesc& desc) override final;
  1106. };
  1107. class BatchedMeshModifyBase : public BatchedMeshBase {
  1108. DEF_OPR_IMPL_CTOR(BatchedMeshModifyBase, BatchedMeshBase);
  1109. public:
  1110. virtual void exec(
  1111. _megdnn_tensor_inout data, _megdnn_tensor_in value, const IndexDesc& desc,
  1112. _megdnn_workspace workspace) = 0;
  1113. };
  1114. class MeshIndexing : public NormalMeshBase {
  1115. DEF_OPR_IMPL(MeshIndexing, NormalMeshBase, 0, 0);
  1116. public:
  1117. virtual void exec(
  1118. _megdnn_tensor_in src, const IndexDesc& desc, _megdnn_tensor_out dst,
  1119. _megdnn_workspace workspace) = 0;
  1120. static void deduce_layout(
  1121. const TensorLayout& inp, const IndexDescLayoutOnly& layouts,
  1122. TensorLayout& out_layout);
  1123. };
  1124. class IncrMeshIndexing : public NormalMeshModifyBase {
  1125. DEF_OPR_IMPL(IncrMeshIndexing, NormalMeshModifyBase, 0, 0);
  1126. };
  1127. class SetMeshIndexing : public NormalMeshModifyBase {
  1128. DEF_OPR_IMPL(SetMeshIndexing, NormalMeshModifyBase, 0, 0);
  1129. };
  1130. class BatchedMeshIndexing : public BatchedMeshBase {
  1131. DEF_OPR_IMPL(BatchedMeshIndexing, BatchedMeshBase, 0, 0);
  1132. public:
  1133. virtual void exec(
  1134. _megdnn_tensor_in src, const IndexDesc& desc, _megdnn_tensor_out dst,
  1135. _megdnn_workspace workspace) = 0;
  1136. static void deduce_layout(
  1137. const TensorLayout& inp, const IndexDescLayoutOnly& layouts,
  1138. TensorLayout& out_layout);
  1139. };
  1140. class BatchedIncrMeshIndexing : public BatchedMeshModifyBase {
  1141. DEF_OPR_IMPL(BatchedIncrMeshIndexing, BatchedMeshModifyBase, 0, 0);
  1142. };
  1143. class BatchedSetMeshIndexing : public BatchedMeshModifyBase {
  1144. DEF_OPR_IMPL(BatchedSetMeshIndexing, BatchedMeshModifyBase, 0, 0);
  1145. };
  1146. class RelayoutFormat : public OperatorBase {
  1147. DEF_OPR_PARAM(RelayoutFormat);
  1148. DEF_OPR_IMPL(RelayoutFormat, OperatorBase, 1, 1);
  1149. public:
  1150. virtual void exec(
  1151. _megdnn_tensor_in src, _megdnn_tensor_out dst,
  1152. _megdnn_workspace workspace) = 0;
  1153. void deduce_layout(const TensorLayout& src, TensorLayout& dst);
  1154. void deduce_format(TensorFormat src, TensorFormat& dst);
  1155. virtual size_t get_workspace_in_bytes(
  1156. const TensorLayout& src, const TensorLayout& dst) = 0;
  1157. protected:
  1158. void deduce_layout_fwd(const TensorLayout& src, TensorLayout& dst);
  1159. void check_layout_fwd(const TensorLayout& src, const TensorLayout& dst);
  1160. void check_exec(
  1161. const TensorLayout& src, const TensorLayout& dst,
  1162. size_t workspace_in_bytes);
  1163. void deduce_exec_layout(
  1164. const TensorLayout& src, const TensorLayout& dst,
  1165. TensorLayout& exec_workspace, TensorLayout& exec_src,
  1166. TensorLayout& exec_dst);
  1167. };
  1168. /*!
  1169. * \brief check whether input contains inf or nan value.
  1170. */
  1171. class CheckNonFinite : public OperatorBase {
  1172. DEF_OPR_PARAM(CheckNonFinite);
  1173. DEF_OPR_IMPL(CheckNonFinite, OperatorBase, -1, 1);
  1174. size_t m_size = 0;
  1175. public:
  1176. virtual size_t get_workspace_in_bytes(
  1177. const TensorNDArray& srcs, const TensorLayout& dst) = 0;
  1178. void deduce_layout(const TensorLayoutArray& srcs, TensorLayout& dst);
  1179. virtual void exec(
  1180. _megdnn_in const TensorNDArray& srcs, _megdnn_tensor_out dst,
  1181. _megdnn_workspace workspace) = 0;
  1182. protected:
  1183. void check_exec(
  1184. const TensorNDArray& srcs, const TensorND& dst, size_t workspace_in_bytes);
  1185. };
  1186. /*!
  1187. * \brief fill the tensor with a scalar value
  1188. */
  1189. class Fill : public OperatorBase {
  1190. DEF_OPR_PARAM(Fill);
  1191. DEF_OPR_IMPL(Fill, OperatorBase, 0, 1);
  1192. public:
  1193. virtual void exec(_megdnn_tensor_out dst, _megdnn_workspace workspace) = 0;
  1194. virtual size_t get_workspace_in_bytes(const TensorLayout& dst) = 0;
  1195. protected:
  1196. void check_exec(const TensorLayout& dst, size_t workspace_in_bytes);
  1197. };
  1198. /*!
  1199. * \brief standard padding operator
  1200. * Inputs must have the same dtype, and the output tensor shape must greater or equal
  1201. * than input tensor in every dimensions, the extra space will be fulled with m which
  1202. * default to be 0.
  1203. */
  1204. class PaddingBase : public OperatorBase {
  1205. DEF_OPR_PARAM(Padding);
  1206. DEF_OPR_IMPL(PaddingBase, OperatorBase, 1, 1);
  1207. public:
  1208. using Mode = Param::PaddingMode;
  1209. protected:
  1210. SmallVector<size_t> get_offsets();
  1211. void check_exec(const TensorLayout& src, const TensorLayout& dst);
  1212. };
  1213. class PaddingForward : public PaddingBase {
  1214. DEF_OPR_IMPL(PaddingForward, PaddingBase, 1, 1);
  1215. public:
  1216. virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst) = 0;
  1217. void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace) {
  1218. return exec(src, dst);
  1219. }
  1220. virtual size_t get_workspace_in_bytes(
  1221. const TensorLayout& src, const TensorLayout& dst) = 0;
  1222. void deduce_layout(const TensorLayout& src, TensorLayout& dst);
  1223. protected:
  1224. void forward_check_exec(const TensorLayout& src, const TensorLayout& dst);
  1225. };
  1226. using Padding = PaddingForward;
  1227. class PaddingBackward : public PaddingBase {
  1228. DEF_OPR_IMPL(PaddingBackward, PaddingBase, 1, 1);
  1229. public:
  1230. virtual void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst) = 0;
  1231. void exec(_megdnn_tensor_in src, _megdnn_tensor_out dst, _megdnn_workspace) {
  1232. return exec(src, dst);
  1233. }
  1234. virtual size_t get_workspace_in_bytes(
  1235. const TensorLayout& src, const TensorLayout& dst) = 0;
  1236. protected:
  1237. void backward_check_exec(const TensorLayout& src, const TensorLayout& dst);
  1238. };
  1239. } // namespace megdnn
  1240. #include "megdnn/internal/opr_header_epilogue.h"
  1241. // vim: syntax=cpp.doxygen