You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

string_ops.h 18 kB

5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. /*!
  17. * \file string_ops.h
  18. * \brief Operator prototype registrations (REG_OP) for the string operators.
  19. */
  20. #ifndef OPS_BUILT_IN_OP_PROTO_INC_STRING_OPS_H_
  21. #define OPS_BUILT_IN_OP_PROTO_INC_STRING_OPS_H_
  22. #include <sstream>
  23. #include "graph/operator_reg.h"
  24. namespace ge {
  25. /**
  26. *@brief Split elements of input based on delimiter into a SparseTensor . \n
  27. *@par Inputs:
  28. *Two inputs, including:
  29. *@li input:1-D. Strings to split. Has type string.
  30. *@li delimiter:0-D. Delimiter characters (bytes), or empty string. Has type string . \n
  31. *@par Attributes:
  32. *skip_empty:A bool. If True, skip the empty strings from the result. Defaults to true . \n
  33. *@par Outputs:
  34. *@li indices:A dense matrix of int64 representing the indices of the sparse tensor.
  35. *@li values:A vector of strings corresponding to the split values.
  36. *@li shape:A length-2 vector of int64 representing the shape of the sparse tensor,
  37. *where the first value is N and the second value is the maximum number of tokens
  38. *in a single input entry . \n
  39. *@see StringSplit()
  40. *@par Third-party framework compatibility
  41. *Compatible with the StringSplit op of TensorFlow.
  42. *@par Restrictions:
  43. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  44. */
  45. REG_OP(StringSplit)
  46. .INPUT(input, TensorType({DT_STRING}))
  47. .INPUT(delimiter, TensorType({DT_STRING}))
  48. .OUTPUT(indices, TensorType({DT_INT64}))
  49. .OUTPUT(values, TensorType({DT_STRING}))
  50. .OUTPUT(shape, TensorType({DT_INT64}))
  51. .ATTR(skip_empty, Bool, true)
  52. .OP_END_FACTORY_REG(StringSplit)
  53. /**
  54. *@brief Split elements of input based on sep into a SparseTensor . \n
  55. *@par Inputs:
  56. *Two inputs, including:
  57. *@li input:1-D. Strings to split. Has type string.
  58. *@li sep:0-D string Tensor, the delimiter character . \n
  59. *@par Attributes:
  60. *maxsplit:An int. If maxsplit > 0, it limits the number of splits in the result. Defaults to -1 . \n
  61. *@par Outputs:
  62. *@li indices:A dense matrix of int64 representing the indices of the sparse tensor.
  63. *@li values:A vector of strings corresponding to the split values.
  64. *@li shape:A length-2 vector of int64 representing the shape of the sparse tensor,
  65. *where the first value is N and the second value is the maximum number of tokens
  66. *in a single input entry . \n
  67. *@see StringSplitV2()
  68. *@par Third-party framework compatibility
  69. *Compatible with the StringSplitV2 op of TensorFlow.
  70. *@par Restrictions:
  71. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  72. */
  73. REG_OP(StringSplitV2)
  74. .INPUT(input, TensorType({DT_STRING}))
  75. .INPUT(sep, TensorType({DT_STRING}))
  76. .OUTPUT(indices, TensorType({DT_INT64}))
  77. .OUTPUT(values, TensorType({DT_STRING}))
  78. .OUTPUT(shape, TensorType({DT_INT64}))
  79. .ATTR(maxsplit, Int, -1)
  80. .OP_END_FACTORY_REG(StringSplitV2)
  81. /**
  82. *@brief Determine the script codes of a given tensor of Unicode integer code points . \n
  83. *@par Inputs:
  84. *One input, including:
  85. *x:A Tensor of int32 Unicode code points . \n
  86. *@par Outputs:
  87. *y:A Tensor of int32 script codes corresponding to each input code point . \n
  88. *@attention Constraints:
  89. *This operation converts Unicode code points to script codes corresponding to
  90. *each code point. Script codes correspond to International Components for
  91. *Unicode (ICU) UScriptCode values.
  92. *See http://icu-project.org/apiref/icu4c/uscript_8h.html.
  93. *Returns -1 (USCRIPT_INVALID_CODE) for invalid code points.
  94. *The output shape matches the input shape . \n
  95. *@see UnicodeScript()
  96. *@par Third-party framework compatibility
  97. *Compatible with the UnicodeScript op of TensorFlow.
  98. *@par Restrictions:
  99. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  100. */
  101. REG_OP(UnicodeScript)
  102. .INPUT(x, TensorType({DT_INT32}))
  103. .OUTPUT(y, TensorType({DT_INT32}))
  104. .OP_END_FACTORY_REG(UnicodeScript)
  105. /**
  106. *@brief Return substrings from Tensor of strings . \n
  107. *@par Inputs:
  108. *Three inputs, including:
  109. *@li input:Tensor of strings.
  110. *@li pos:Scalar defining the position of first character in each substring.
  111. *@li len:Scalar defining the number of characters to include in each substring . \n
  112. *@par Outputs:
  113. *output:Tensor of substrings . \n
  114. *@attention Constraints:
  115. *pos and len are registered with type int32 or int64.
  116. *NOTE(review): this section used to describe the hash function of the
  117. *StringToHashBucketFast op, which does not apply to Substr, so that text
  118. *is not repeated here. The real constraints of this op (for example the
  119. *valid range of pos and len, and whether pos and len must share one type
  120. *or may be broadcast against input) are not visible in this header;
  121. *confirm them against the kernel implementation . \n
  122. *@see Substr()
  123. *@par Third-party framework compatibility
  124. *Compatible with the Substr op of TensorFlow.
  125. *@par Restrictions:
  126. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  127. */
  128. REG_OP(Substr)
  129. .INPUT(input, TensorType({DT_STRING}))
  130. .INPUT(pos, TensorType({DT_INT32, DT_INT64}))
  131. .INPUT(len, TensorType({DT_INT32, DT_INT64}))
  132. .OUTPUT(output, TensorType({DT_STRING}))
  133. .OP_END_FACTORY_REG(Substr)
  134. /**
  135. *@brief Converts each string in the input Tensor to its hash mod by a number of buckets . \n
  136. *@par Inputs:
  137. *x:The strings to assign a hash bucket. Has type string . \n
  138. *@par Attributes:
  139. *num_buckets:An int. The number of buckets. Defaults to 1 . \n
  140. *@par Outputs:
  141. *y:An int64 Tensor of the same shape as the input x . \n
  142. *@attention Constraints:
  143. *The hash function is deterministic on the content of the string within the process
  144. *and will never change. However, it is not suitable for cryptography. This function
  145. *may be used when CPU time is scarce and inputs are trusted or unimportant. There is
  146. *a risk of adversaries constructing inputs that all hash to the same bucket. To
  147. *prevent this problem, use a strong hash function with tf.string_to_hash_bucket_strong . \n
  148. *@see StringToHashBucketFast()
  149. *@par Third-party framework compatibility
  150. *Compatible with the StringToHashBucketFast op of TensorFlow.
  151. *@par Restrictions:
  152. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  153. */
  154. REG_OP(StringToHashBucketFast)
  155. .INPUT(x, TensorType({DT_STRING}))
  156. .OUTPUT(y, TensorType({DT_INT64}))
  157. .ATTR(num_buckets, Int, 1)
  158. .OP_END_FACTORY_REG(StringToHashBucketFast)
  159. /**
  160. *@brief Converts each string in the input Tensor to its hash mod by a number of buckets . \n
  161. *@par Inputs:
  162. *x:The strings to assign a hash bucket. Has type string . \n
  163. *@par Attributes:
  164. *@li num_buckets:An int. The number of buckets. Defaults to 1.
  165. *@li key:A required list of ints. The key that seeds the hash function (see Constraints) . \n
  166. *@par Outputs:
  167. *y:An int64 Tensor of the same shape as the input x . \n
  168. *@attention Constraints:
  169. *@li A strong hash is important when inputs may be malicious, e.g. URLs with
  170. *additional components. Adversaries could try to make their inputs hash to
  171. *the same bucket for a denial-of-service attack or to skew the results.
  172. *A strong hash can be used to make it difficult to find inputs with a skewed
  173. *hash value distribution over buckets. This requires that the hash function
  174. *is seeded by a high-entropy (random) "key" unknown to the adversary.
  175. *@li The additional robustness comes at a cost of roughly 4x higher
  176. *compute time than tf.string_to_hash_bucket_fast . \n
  177. *@see StringToHashBucketStrong()
  178. *@par Third-party framework compatibility
  179. *Compatible with the StringToHashBucketStrong op of TensorFlow.
  180. *@par Restrictions:
  181. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  182. */
  183. REG_OP(StringToHashBucketStrong)
  184. .INPUT(x, TensorType({DT_STRING}))
  185. .OUTPUT(y, TensorType({DT_INT64}))
  186. .ATTR(num_buckets, Int, 1)
  187. .REQUIRED_ATTR(key, ListInt)
  188. .OP_END_FACTORY_REG(StringToHashBucketStrong)
  189. /**
  190. *@brief Converts each string in the input Tensor to its hash mod by a number of buckets . \n
  191. *@par Inputs:
  192. *One input, including:
  193. *string_tensor:The strings to assign a hash bucket. Has type string . \n
  194. *@par Attributes:
  195. *num_buckets:An int. The number of buckets. Defaults to 1 . \n
  196. *@par Outputs:
  197. *y:An int64 Tensor of the same shape as the input string_tensor . \n
  198. *@see StringToHashBucket()
  199. *@par Third-party framework compatibility
  200. *Compatible with the StringToHashBucket op of TensorFlow.
  201. *@par Restrictions:
  202. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  203. */
  204. REG_OP(StringToHashBucket)
  205. .INPUT(string_tensor, TensorType({DT_STRING}))
  206. .OUTPUT(y, TensorType({DT_INT64}))
  207. .ATTR(num_buckets, Int, 1)
  208. .OP_END_FACTORY_REG(StringToHashBucket)
  209. /**
  210. *@brief Strip leading and trailing whitespace from each string in the Tensor . \n
  211. *@par Inputs:
  212. *One input, including:
  213. *x:A string Tensor of any shape . \n
  214. *@par Outputs:
  215. *y:A string Tensor of the same shape as the input . \n
  216. *@see StringStrip()
  217. *@par Third-party framework compatibility
  218. *Compatible with the StringStrip op of TensorFlow.
  219. *@par Restrictions:
  220. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  221. */
  222. REG_OP(StringStrip)
  223. .INPUT(x, TensorType({DT_STRING}))
  224. .OUTPUT(y, TensorType({DT_STRING}))
  225. .OP_END_FACTORY_REG(StringStrip)
  226. /**
  227. *@brief Computes the length of each string given in the input tensor . \n
  228. *@par Inputs:
  229. *One input, including:
  230. *x:The strings for which to compute the length. Has type string . \n
  231. *@par Attributes:
  232. *unit:A string. The unit that is counted to compute string length. Defaults to "BYTE".
  233. *One of: "BYTE" (for the number of bytes in each string) or
  234. *"UTF8_CHAR" (for the number of UTF-8 encoded Unicode code points in each string).
  235. *Results are undefined if unit=UTF8_CHAR and the input strings do not contain
  236. *structurally valid UTF-8 . \n
  237. *@par Outputs:
  238. *y:An int32 tensor that has the same shape as input.
  239. *The output contains the element-wise string lengths of input . \n
  240. *@see StringLength()
  241. *@par Third-party framework compatibility
  242. *Compatible with the StringLength op of TensorFlow.
  243. *@par Restrictions:
  244. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  245. */
  246. REG_OP(StringLength)
  247. .INPUT(x, TensorType({DT_STRING}))
  248. .OUTPUT(y, TensorType({DT_INT32}))
  249. .ATTR(unit, String, "BYTE")
  250. .OP_END_FACTORY_REG(StringLength)
  251. /**
  252. *@brief Joins the strings in the given list of string tensors into one tensor . \n
  253. *@par Inputs:
  254. *One dynamic input, including:
  255. *x:A list of string tensors. The tensors must all have the same shape,
  256. *or be scalars. Scalars may be mixed in; these will be broadcast to the shape
  257. *of non-scalar inputs. It's a dynamic input . \n
  258. *@par Attributes:
  259. *@li N:A required int. The length of input x, that is, the number of
  260. *tensors passed in the dynamic input.
  261. *@li separator:A string, an optional join separator.
  262. *Defaults to "" (the empty string) . \n
  263. *@par Outputs:
  264. *y:The output tensor. Has type string . \n
  265. *@attention Constraints:
  266. *All non-scalar tensors in x must share one shape, as described for
  267. *input x above . \n
  268. *@see StringJoin()
  269. *@par Third-party framework compatibility
  270. *Compatible with the StringJoin op of TensorFlow.
  271. *@par Restrictions:
  272. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  273. */
  274. REG_OP(StringJoin)
  275. .DYNAMIC_INPUT(x, TensorType({DT_STRING}))
  276. .OUTPUT(y, TensorType({DT_STRING}))
  277. .REQUIRED_ATTR(N, Int)
  278. .ATTR(separator, String, "")
  279. .OP_END_FACTORY_REG(StringJoin)
  280. /**
  281. *@brief Formats a string template using a list of tensors . \n
  282. *@par Inputs:
  283. *One dynamic input, including:
  284. *x:The tensors to format into the placeholder string. Supported types: int8,
  285. *uint8, int16, uint16, int32, int64, uint32, uint64, string, float16, float,
  286. *double, bool. It's a dynamic input . \n
  287. *@par Attributes:
  288. *@li template:A string, the template to format tensor summaries into.
  289. *Defaults to "%s".
  290. *@li placeholder:A string, at each placeholder in the template a subsequent
  291. *tensor summary will be inserted. Defaults to "%s".
  292. *@li summarize:An int. When formatting the tensor summaries print the first
  293. *and last summarize entries of each tensor dimension. Defaults to 3 . \n
  294. *@par Outputs:
  295. *y:The resulting string scalar . \n
  296. *@see StringFormat()
  297. *@par Third-party framework compatibility
  298. *Compatible with the StringFormat op of TensorFlow.
  299. *@par Restrictions:
  300. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  301. */
  302. REG_OP(StringFormat)
  303. .DYNAMIC_INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
  304. DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, DT_STRING, DT_FLOAT16, \
  305. DT_FLOAT, DT_DOUBLE, DT_BOOL}))
  306. .OUTPUT(y, TensorType({DT_STRING}))
  307. .ATTR(template, String, "%s")
  308. .ATTR(placeholder, String, "%s")
  309. .ATTR(summarize, Int, 3)
  310. .OP_END_FACTORY_REG(StringFormat)
  311. /**
  312. *@brief Check if the input matches the regex pattern . \n
  313. *@par Inputs:
  314. *The input is a string tensor of any shape. The pattern is a scalar string tensor
  315. *which is applied to every element of the input tensor. The boolean values
  316. *(True or False) of the output tensor indicate if the input matches the regex
  317. *pattern provided. The pattern follows the re2 syntax
  318. *(https://github.com/google/re2/wiki/Syntax).
  319. *Two inputs, including:
  320. *@li x:A string tensor of the text to be processed.
  321. *@li pattern:A scalar string tensor containing the regular expression to match the input . \n
  322. *@par Outputs:
  323. *y:A bool tensor with the same shape as input x . \n
  324. *@see RegexFullMatch()
  325. *@par Third-party framework compatibility
  326. *Compatible with the RegexFullMatch op of TensorFlow.
  327. *@par Restrictions:
  328. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  329. */
  330. REG_OP(RegexFullMatch)
  331. .INPUT(x, TensorType({DT_STRING}))
  332. .INPUT(pattern, TensorType({DT_STRING}))
  333. .OUTPUT(y, TensorType({DT_BOOL}))
  334. .OP_END_FACTORY_REG(RegexFullMatch)
  335. /**
  336. *@brief Replaces matches of the pattern regular expression in input with the
  337. *replacement string provided in rewrite . \n
  338. *@par Inputs:
  339. *The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax).
  340. *Three inputs, including:
  341. *@li x:The text to be processed.
  342. *@li pattern:The regular expression to be matched in the input strings.
  343. *@li rewrite:The rewrite string to be substituted for the pattern expression
  344. *where it is matched in the input strings . \n
  345. *@par Attributes:
  346. *replace_global:A bool. If True, the replacement is global
  347. *(that is, all matches of the pattern regular expression in each input string
  348. *are rewritten), otherwise the rewrite substitution is only made for the first
  349. *pattern match. Defaults to true . \n
  350. *@par Outputs:
  351. *y:The text after applying pattern match and rewrite substitution . \n
  352. *@see RegexReplace()
  353. *@par Third-party framework compatibility
  354. *Compatible with the RegexReplace op of TensorFlow.
  355. *@par Restrictions:
  356. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  357. */
  358. REG_OP(RegexReplace)
  359. .INPUT(x, TensorType({DT_STRING}))
  360. .INPUT(pattern, TensorType({DT_STRING}))
  361. .INPUT(rewrite, TensorType({DT_STRING}))
  362. .OUTPUT(y, TensorType({DT_STRING}))
  363. .ATTR(replace_global, Bool, true)
  364. .OP_END_FACTORY_REG(RegexReplace)
  365. /**
  366. *@brief Converts each entry in the given tensor to strings . \n
  367. *@par Inputs:
  368. *Supports many numeric types and boolean. One input, including:
  369. *x:A tensor to be converted to strings. Supported types: int8, int16, int32,
  370. *int64, float, double, bool, complex64, complex128 . \n
  371. *@par Attributes:
  372. *@li precision:An int. The post-decimal precision to use for floating point
  373. *numbers. Only used if precision > -1. Defaults to -1.
  374. *@li scientific:A bool. Use scientific notation for floating point numbers. Defaults to false.
  375. *@li shortest:A bool. Use shortest representation (either scientific or standard)
  376. *for floating point numbers. Defaults to false.
  377. *@li width:An int. Pad pre-decimal numbers to this width. Applies to both floating
  378. *point and integer numbers. Only used if width > -1. Defaults to -1.
  379. *@li fill:A string. The value to pad if width > -1. If empty, pads with spaces.
  380. *Another typical value is '0'. String cannot be longer than 1 character. Defaults to "" . \n
  381. *@par Outputs:
  382. *y:The output tensor. Has type string . \n
  383. *@see AsString()
  384. *@par Third-party framework compatibility
  385. *Compatible with the AsString op of TensorFlow.
  386. *@par Restrictions:
  387. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  388. */
  389. REG_OP(AsString)
  390. .INPUT(x, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_FLOAT, \
  391. DT_DOUBLE, DT_BOOL, DT_COMPLEX64, DT_COMPLEX128}))
  392. .OUTPUT(y, TensorType({DT_STRING}))
  393. .ATTR(precision, Int, -1)
  394. .ATTR(scientific, Bool, false)
  395. .ATTR(shortest, Bool, false)
  396. .ATTR(width, Int, -1)
  397. .ATTR(fill, String, "")
  398. .OP_END_FACTORY_REG(AsString)
  399. /**
  400. *@brief Encode strings into web-safe base64 format . \n
  401. *@par Inputs:
  402. *One input, including:
  403. *x:Strings to be encoded. Has type string . \n
  404. *@par Attributes:
  405. *pad:A bool, whether padding is applied at the ends.
  406. *Defaults to false . \n
  407. *@par Outputs:
  408. *y:Input strings encoded in web-safe base64.
  409. *Has type string . \n
  410. *@attention Constraints:
  411. *Refer to the following article for more information on base64 format:
  412. *en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '='
  413. *at the end so that the encoded string has a length that is a multiple of 4.
  414. *See Padding section of the link above. Web-safe means that the encoder
  415. *uses - and _ instead of + and / . \n
  416. *@see EncodeBase64()
  417. *@par Third-party framework compatibility
  418. *Compatible with the EncodeBase64 op of TensorFlow.
  419. *@par Restrictions:
  420. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  421. */
  422. REG_OP(EncodeBase64)
  423. .INPUT(x, TensorType({DT_STRING}))
  424. .OUTPUT(y, TensorType({DT_STRING}))
  425. .ATTR(pad, Bool, false)
  426. .OP_END_FACTORY_REG(EncodeBase64)
  427. /**
  428. *@brief Decode web-safe base64-encoded strings . \n
  429. *@par Inputs:
  430. *Input may or may not have padding at the end. See EncodeBase64 for padding.
  431. *Web-safe means that input must use - and _ instead of + and /.
  432. *One input, including:
  433. *x:Base64 strings to decode. Has type string . \n
  434. *@par Outputs:
  435. *y:Decoded strings. Has type string . \n
  436. *@see DecodeBase64()
  437. *@par Third-party framework compatibility
  438. *Compatible with the DecodeBase64 op of TensorFlow.
  439. *@par Restrictions:
  440. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  441. */
  442. REG_OP(DecodeBase64)
  443. .INPUT(x, TensorType({DT_STRING}))
  444. .OUTPUT(y, TensorType({DT_STRING}))
  445. .OP_END_FACTORY_REG(DecodeBase64)
  446. } // namespace ge
  447. #endif // OPS_BUILT_IN_OP_PROTO_INC_STRING_OPS_H_

图引擎模块(GE)是MindSpore的一个子模块,其代码由C++实现,位于前端模块ME和底层硬件之间,起到承接作用。图引擎模块以ME下发的图作为输入,然后进行一系列的深度图优化操作,最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点,做了特定的优化工作,以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时,GE会被自动调用而用户并不感知。GE主要由GE API和GE Core两部分组成,详细的架构图如下所示。