/**
* Copyright 2019 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*!
* \file string_ops.h
* \brief
*/
#ifndef OPS_BUILT_IN_OP_PROTO_INC_STRING_OPS_H_
#define OPS_BUILT_IN_OP_PROTO_INC_STRING_OPS_H_
#include <sstream>
#include "graph/operator_reg.h"
namespace ge {
/**
*@brief Creates ngrams from ragged string data. \n
*@par Inputs:
include:
*@li data:1-D. The values tensor of the ragged string tensor to make ngrams out of.
*@li data_splits:The splits tensor of the ragged string tensor to make ngrams out of. \n
*@par Attributes:
* separator:The string to append between elements of the token. Use "" for no separator.
* ngram_widths:The sizes of the ngrams to create.
* left_pad:The string to use to pad the left side of the ngram sequence. Only used if pad_width != 0.
* right_pad:The string to use to pad the right side of the ngram sequence. Only used if pad_width != 0.
* pad_width:The number of padding elements to add to each side of each sequence.
* preserve_short_sequences: Preserve short sequences. \n
*@par Outputs:
*@li ngrams:The values tensor of the output ngrams ragged tensor.
*@li ngrams_splits:The splits tensor of the output ngrams ragged tensor. \n
*@see StringNGrams()
*@par Third-party framework compatibility
*compatible with StringNGrams op of tensorflow
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(StringNGrams)
    .INPUT(data, TensorType({DT_STRING}))
    .INPUT(data_splits, TensorType({DT_INT32, DT_INT64}))
    .OUTPUT(ngrams, TensorType({DT_STRING}))
    .OUTPUT(ngrams_splits, TensorType({DT_INT32, DT_INT64}))
    .REQUIRED_ATTR(separator, String)
    .ATTR(ngram_widths, ListInt, {})
    .REQUIRED_ATTR(left_pad, String)
    .REQUIRED_ATTR(right_pad, String)
    .REQUIRED_ATTR(pad_width, Int)
    .REQUIRED_ATTR(preserve_short_sequences, Bool)
    .OP_END_FACTORY_REG(StringNGrams)
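// Worked example added for clarity (not part of the original header), assuming the
// TensorFlow StringNGrams semantics that the op is documented as compatible with:
//   data        = ["a", "b", "c", "d"]        (ragged values)
//   data_splits = [0, 2, 4]                   (two rows: ["a", "b"] and ["c", "d"])
//   separator = " ", ngram_widths = [2], pad_width = 0, preserve_short_sequences = false
//   => ngrams        = ["a b", "c d"]
//      ngrams_splits = [0, 1, 2]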
/**
*@brief Decodes each string in `input` into a sequence of Unicode code points. \n
*@par Inputs:
include:
*@li input:The text to be decoded. Can have any shape. Note that the output is flattened
to a vector of char values. \n
*@par Attributes:
* input_encoding:Text encoding of the input strings. This is any of the encodings supported
by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
* errors:Error handling policy when there is invalid formatting found in the input.
The value of 'strict' will cause the operation to produce an InvalidArgument
error on any invalid input formatting. A value of 'replace' (the default) will
cause the operation to replace any invalid formatting in the input with the
`replacement_char` codepoint. A value of 'ignore' will cause the operation to
skip any invalid formatting in the input and produce no corresponding output
character.
* replacement_char:The replacement character codepoint to be used in place of any invalid
formatting in the input when `errors='replace'`. Any valid Unicode codepoint may
be used. The default value is the Unicode replacement character, U+FFFD (65533).
* replace_control_characters:Whether to replace the C0 control characters (00-1F) with the
`replacement_char`. Default is false. \n
*@par Outputs:
*@li row_splits:A 1D tensor containing the row splits.
*@li char_values:A 1D tensor containing the decoded codepoints.
*@li char_to_byte_starts:A 1D int64 Tensor containing the byte index in the input string where each
character in `char_values` starts. \n
*@see UnicodeDecodeWithOffsets()
*@par Third-party framework compatibility
*compatible with UnicodeDecodeWithOffsets op of tensorflow
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(UnicodeDecodeWithOffsets)
    .INPUT(input, TensorType({DT_STRING}))
    .OUTPUT(row_splits, TensorType({DT_INT64}))
    .OUTPUT(char_values, TensorType({DT_INT32}))
    .OUTPUT(char_to_byte_starts, TensorType({DT_INT64}))
    .REQUIRED_ATTR(input_encoding, String)
    .ATTR(errors, String, "replace")
    .ATTR(replacement_char, Int, 65533)
    .ATTR(replace_control_characters, Bool, false)
    .ATTR(Tsplits, Type, DT_INT64)
    .OP_END_FACTORY_REG(UnicodeDecodeWithOffsets)
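// Worked example added for clarity (not part of the original header), assuming the
// TensorFlow UnicodeDecodeWithOffsets semantics referenced above:
//   input = ["héllo"], input_encoding = "UTF-8"
//   => row_splits          = [0, 5]
//      char_values         = [104, 233, 108, 108, 111]   (Unicode code points)
//      char_to_byte_starts = [0, 1, 3, 4, 5]             ('é' occupies two bytes)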
/**
*@brief Decodes each string in `input` into a sequence of Unicode code points. \n
*@par Inputs:
include:
*@li input:The text to be decoded. Can have any shape. Note that the output is flattened
to a vector of char values. \n
*@par Attributes:
* input_encoding:Text encoding of the input strings. This is any of the encodings supported
by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
* errors:Error handling policy when there is invalid formatting found in the input.
The value of 'strict' will cause the operation to produce an InvalidArgument
error on any invalid input formatting. A value of 'replace' (the default) will
cause the operation to replace any invalid formatting in the input with the
`replacement_char` codepoint. A value of 'ignore' will cause the operation to
skip any invalid formatting in the input and produce no corresponding output
character.
* replacement_char:The replacement character codepoint to be used in place of any invalid
formatting in the input when `errors='replace'`. Any valid Unicode codepoint may
be used. The default value is the Unicode replacement character, U+FFFD (65533).
* replace_control_characters:Whether to replace the C0 control characters (00-1F) with the
`replacement_char`. Default is false. \n
*@par Outputs:
*@li row_splits:A 1D tensor containing the row splits.
*@li char_values:A 1D tensor containing the decoded codepoints. \n
*@see UnicodeDecode()
*@par Third-party framework compatibility
*compatible with UnicodeDecode op of tensorflow
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(UnicodeDecode)
    .INPUT(input, TensorType({DT_STRING}))
    .OUTPUT(row_splits, TensorType({DT_INT64}))
    .OUTPUT(char_values, TensorType({DT_INT32}))
    .REQUIRED_ATTR(input_encoding, String)
    .ATTR(errors, String, "replace")
    .ATTR(replacement_char, Int, 65533)
    .ATTR(replace_control_characters, Bool, false)
    .ATTR(Tsplits, Type, DT_INT64)
    .OP_END_FACTORY_REG(UnicodeDecode)
/**
*@brief Transcode the input text from a source encoding to a destination encoding. \n
*@par Inputs:
include:
*@li input:The text to be processed. Can have any shape. \n
*@par Attributes:
* input_encoding:Text encoding of the input strings. This is any of the encodings supported
by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
* output_encoding:The unicode encoding to use in the output. Must be one of `"UTF-8", "UTF-16-BE", "UTF-32-BE"`.
Multi-byte encodings will be big-endian.
* errors:Error handling policy when there is invalid formatting found in the input.
The value of 'strict' will cause the operation to produce an InvalidArgument
error on any invalid input formatting. A value of 'replace' (the default) will
cause the operation to replace any invalid formatting in the input with the
`replacement_char` codepoint. A value of 'ignore' will cause the operation to
skip any invalid formatting in the input and produce no corresponding output
character.
* replacement_char:The replacement character codepoint to be used in place of any invalid
formatting in the input when `errors='replace'`. Any valid Unicode codepoint may
be used. The default value is the Unicode replacement character, U+FFFD (65533).
* replace_control_characters:Whether to replace the C0 control characters (00-1F) with the
`replacement_char`. Default is false. \n
*@par Outputs:
*@li output:A string tensor containing unicode text encoded using `output_encoding`. \n
*@see UnicodeTranscode()
*@par Third-party framework compatibility
*compatible with UnicodeTranscode op of tensorflow
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(UnicodeTranscode)
    .INPUT(input, TensorType({DT_STRING}))
    .OUTPUT(output, TensorType({DT_STRING}))
    .REQUIRED_ATTR(input_encoding, String)
    .ATTR(output_encoding, String, "UTF-8")
    .ATTR(errors, String, "replace")
    .ATTR(replacement_char, Int, 65533)
    .ATTR(replace_control_characters, Bool, false)
    .OP_END_FACTORY_REG(UnicodeTranscode)
/**
*@brief Encodes a tensor of ints into Unicode strings. \n
*@par Inputs:
include:
*@li input_values:A 1D tensor containing the unicode codepoints that should be encoded.
*@li input_splits:A 1D tensor specifying how the unicode codepoints should be split into strings. \n
*@par Attributes:
* output_encoding:The unicode encoding to use in the output. Must be one of `"UTF-8", "UTF-16-BE", "UTF-32-BE"`.
Multi-byte encodings will be big-endian.
* errors:Error handling policy when there is invalid formatting found in the input.
The value of 'strict' will cause the operation to produce an InvalidArgument
error on any invalid input formatting. A value of 'replace' (the default) will
cause the operation to replace any invalid formatting in the input with the
`replacement_char` codepoint. A value of 'ignore' will cause the operation to
skip any invalid formatting in the input and produce no corresponding output
character.
* replacement_char:The replacement character codepoint to be used in place of any invalid
formatting in the input when `errors='replace'`. Any valid Unicode codepoint may
be used. The default value is the Unicode replacement character, U+FFFD (65533). \n
*@par Outputs:
*@li output:The 1-D Tensor of strings encoded from the provided unicode codepoints. \n
*@see UnicodeEncode()
*@par Third-party framework compatibility
*compatible with UnicodeEncode op of tensorflow
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(UnicodeEncode)
    .INPUT(input_values, TensorType({DT_INT32}))
    .INPUT(input_splits, TensorType({DT_INT32, DT_INT64}))
    .OUTPUT(output, TensorType({DT_STRING}))
    .ATTR(errors, String, "replace")
    .ATTR(output_encoding, String, "UTF-8")
    .ATTR(replacement_char, Int, 65533)
    .OP_END_FACTORY_REG(UnicodeEncode)
/**
*@brief Split elements of input based on delimiter into a SparseTensor. \n
*@par Inputs:
include:
*@li input:1-D. Strings to split.
*@li delimiter:0-D. Delimiter characters (bytes), or empty string. \n
*@par Attributes:
* skip_empty:A bool. If True, skip the empty strings from the result. \n
*@par Outputs:
*@li indices:A dense matrix of int64 representing the indices of the sparse tensor.
*@li values:A vector of strings corresponding to the split values.
*@li shape:A length-2 vector of int64 representing the shape of the sparse tensor,
*where the first value is N and the second value is the maximum number of tokens
*in a single input entry. \n
*@see StringSplit()
*@par Third-party framework compatibility
*compatible with StringSplit op of tensorflow
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(StringSplit)
    .INPUT(input, TensorType({DT_STRING}))
    .INPUT(delimiter, TensorType({DT_STRING}))
    .OUTPUT(indices, TensorType({DT_INT64}))
    .OUTPUT(values, TensorType({DT_STRING}))
    .OUTPUT(shape, TensorType({DT_INT64}))
    .ATTR(skip_empty, Bool, true)
    .OP_END_FACTORY_REG(StringSplit)
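// Worked example added for clarity (not part of the original header), assuming the
// TensorFlow StringSplit semantics referenced above:
//   input = ["hello world", "a b c"], delimiter = " "
//   => indices = [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2]]
//      values  = ["hello", "world", "a", "b", "c"]
//      shape   = [2, 3]    (N = 2 input entries, at most 3 tokens per entry)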
/**
*@brief Replaces the match of pattern in input with rewrite. \n
*@par Inputs:
include:
*@li input:A Tensor of type string. The text to be processed. \n
*@par Attributes:
*@li pattern:A string. The regular expression to match the input.
*@li rewrite:A string. The rewrite to be applied to the matched expression.
*@li replace_global:An optional bool. Defaults to True. If True, the replacement is global,
otherwise the replacement is done only on the first match.
*@par Outputs:
*@li output:A Tensor of type string.
*/
REG_OP(StaticRegexReplace)
    .INPUT(input, TensorType({DT_STRING}))
    .OUTPUT(output, TensorType({DT_STRING}))
    .ATTR(pattern, String, "")
    .ATTR(rewrite, String, "")
    .ATTR(replace_global, Bool, true)
    .OP_END_FACTORY_REG(StaticRegexReplace)
/**
*@brief Checks whether each element of the input string tensor fully matches the
*regular expression given by the `pattern` attribute. The boolean values
*(True or False) of the output tensor indicate if the input matches the regex
*pattern provided.
*@par Inputs:
include:
*@li input:A Tensor of type string. The text to be processed. \n
*@par Attributes:
*@li pattern:A string. The regular expression to match the input.
*@par Outputs:
*@li output:A bool tensor with the same shape as `input`.
*/
REG_OP(StaticRegexFullMatch)
    .INPUT(input, TensorType({DT_STRING}))
    .OUTPUT(output, TensorType({DT_BOOL}))
    .ATTR(pattern, String, "")
    .OP_END_FACTORY_REG(StaticRegexFullMatch)
/**
*@brief Joins the elements of `input` based on `segment_ids`. \n
*@par Inputs:
include:
*@li input:A Tensor of type string. The text to be processed.
*@li segment_ids:A Tensor. Must be one of the following types: int32, int64.
*A tensor whose shape is a prefix of data.shape. Negative segment ids are not supported.
*@li num_segments:A Tensor. Must be one of the following types: int32, int64. A scalar.
*@par Attributes:
*@li separator:An optional string. Defaults to "". The separator to use when joining.
*@par Outputs:
*@li output:A Tensor of type string.
*/
REG_OP(UnsortedSegmentJoin)
    .INPUT(input, TensorType({DT_STRING}))
    .INPUT(segment_ids, TensorType({DT_INT32,DT_INT64}))
    .INPUT(num_segments, TensorType({DT_INT32,DT_INT64}))
    .OUTPUT(output, TensorType({DT_STRING}))
    .ATTR(separator, String, "")
    .OP_END_FACTORY_REG(UnsortedSegmentJoin)
/**
*@brief Converts each uppercase character in the input strings to its lowercase equivalent.
*@par Inputs:
include:
*@li input:A Tensor of type string. The text to be processed.
*@par Attributes:
*@li encoding:An optional string. Defaults to "".
*@par Outputs:
*@li output:A Tensor of type string.
*/
REG_OP(StringLower)
    .INPUT(input, TensorType({DT_STRING}))
    .OUTPUT(output, TensorType({DT_STRING}))
    .ATTR(encoding, String, "")
    .OP_END_FACTORY_REG(StringLower)
/**
*@brief Converts each lowercase character in the input strings to its uppercase equivalent.
*@par Inputs:
include:
*@li input:A Tensor of type string. The text to be processed.
*@par Attributes:
*@li encoding:An optional string. Defaults to "".
*@par Outputs:
*@li output:A Tensor of type string.
*/
REG_OP(StringUpper)
    .INPUT(input, TensorType({DT_STRING}))
    .OUTPUT(output, TensorType({DT_STRING}))
    .ATTR(encoding, String, "")
    .OP_END_FACTORY_REG(StringUpper)
/**
*@brief Split elements of source based on sep into a SparseTensor. \n
*@par Inputs:
include:
*@li input:1-D. Strings to split.
*@li sep:0-D string Tensor, the delimiter character. \n
*@par Attributes:
* maxsplit:An int. If maxsplit > 0, limits the number of splits of the result. \n
*@par Outputs:
*@li indices:A dense matrix of int64 representing the indices of the sparse tensor.
*@li values:A vector of strings corresponding to the split values.
*@li shape:A length-2 vector of int64 representing the shape of the sparse tensor,
*where the first value is N and the second value is the maximum number of tokens
*in a single input entry. \n
*@see StringSplitV2()
*@par Third-party framework compatibility
*compatible with StringSplitV2 op of tensorflow
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(StringSplitV2)
    .INPUT(input, TensorType({DT_STRING}))
    .INPUT(sep, TensorType({DT_STRING}))
    .OUTPUT(indices, TensorType({DT_INT64}))
    .OUTPUT(values, TensorType({DT_STRING}))
    .OUTPUT(shape, TensorType({DT_INT64}))
    .ATTR(maxsplit, Int, -1)
    .OP_END_FACTORY_REG(StringSplitV2)
/**
*@brief Determine the script codes of a given tensor of Unicode integer code points. \n
*@par Inputs:
include:
*x:A Tensor of int32 Unicode code points. \n
*@par Outputs:
*y:A Tensor of int32 script codes corresponding to each input code point. \n
*@attention Constraints:
*This operation converts Unicode code points to script codes corresponding to
*each code point. Script codes correspond to International Components for
*Unicode (ICU) UScriptCode values.
*See http://icu-project.org/apiref/icu4c/uscript_8h.html.
*Returns -1 (USCRIPT_INVALID_CODE) for invalid codepoints.
*Output shape will match input shape. \n
*@see UnicodeScript()
*@par Third-party framework compatibility
*compatible with UnicodeScript op of tensorflow
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(UnicodeScript)
    .INPUT(x, TensorType({DT_INT32}))
    .OUTPUT(y, TensorType({DT_INT32}))
    .OP_END_FACTORY_REG(UnicodeScript)
/**
*@brief Return substrings from Tensor of strings. \n
*@par Inputs:
include:
*@li input:Tensor of strings.
*@li pos:Scalar defining the position of first character in each substring.
*@li len:Scalar defining the number of characters to include in each substring. \n
*@par Outputs:
*output:Tensor of substrings. \n
*@see Substr()
*@par Third-party framework compatibility
*compatible with Substr op of tensorflow
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(Substr)
    .INPUT(input, TensorType({DT_STRING}))
    .INPUT(pos, TensorType({DT_INT32, DT_INT64}))
    .INPUT(len, TensorType({DT_INT32, DT_INT64}))
    .OUTPUT(output, TensorType({DT_STRING}))
    .OP_END_FACTORY_REG(Substr)
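// Worked example added for clarity (not part of the original header), assuming the
// TensorFlow Substr semantics referenced above; if pos + len reaches past the end of
// a string, the substring is truncated at the end of that string:
//   input = ["Hello", "TensorFlow"], pos = 1, len = 3
//   => output = ["ell", "ens"]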
/**
*@brief Converts each string in the input Tensor to its hash mod by a number of buckets. \n
*@par Inputs:
include:
*x:The strings to assign a hash bucket. \n
*@par Attributes:
*num_buckets:The number of buckets. \n
*@par Outputs:
*y:A Tensor of the same shape as the input x. \n
*@attention Constraints:
*The hash function is deterministic on the content of the string within
*the process and will never change. However, it is not suitable for cryptography.
*This function may be used when CPU time is scarce and inputs are trusted or
*unimportant. There is a risk of adversaries constructing inputs that all hash
*to the same bucket. To prevent this problem, use a strong hash function with
*tf.string_to_hash_bucket_strong. \n
*@see StringToHashBucketFast()
*@par Third-party framework compatibility
*compatible with StringToHashBucketFast op of tensorflow
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(StringToHashBucketFast)
    .INPUT(x, TensorType({DT_STRING}))
    .OUTPUT(y, TensorType({DT_INT64}))
    .ATTR(num_buckets, Int, 1)
    .OP_END_FACTORY_REG(StringToHashBucketFast)
/**
*@brief Converts each string in the input Tensor to its hash mod by a number of buckets. \n
*@par Inputs:
include:
*x:The strings to assign a hash bucket. \n
*@par Attributes:
*@li num_buckets:The number of buckets.
*@li key:The key used to seed the hash function, passed as a list of two elements. \n
*@par Outputs:
*y:A Tensor of the same shape as the input x. \n
*@attention Constraints:
*@li A strong hash is important when inputs may be malicious, e.g. URLs with
*additional components. Adversaries could try to make their inputs hash to
*the same bucket for a denial-of-service attack or to skew the results.
*A strong hash can be used to make it difficult to find inputs with a skewed
*hash value distribution over buckets. This requires that the hash function
*is seeded by a high-entropy (random) "key" unknown to the adversary.
*@li The additional robustness comes at a cost of roughly 4x higher
*compute time than tf.string_to_hash_bucket_fast. \n
*@see StringToHashBucketStrong()
*@par Third-party framework compatibility
*compatible with StringToHashBucketStrong op of tensorflow
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(StringToHashBucketStrong)
    .INPUT(x, TensorType({DT_STRING}))
    .OUTPUT(y, TensorType({DT_INT64}))
    .ATTR(num_buckets, Int, 1)
    .REQUIRED_ATTR(key, ListInt)
    .OP_END_FACTORY_REG(StringToHashBucketStrong)
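// Usage note added for clarity (not part of the original header): following the
// TensorFlow op this is documented as compatible with, the required `key` attribute
// seeds the keyed hash and is passed as a list of two integers, e.g.
//   x = ["Hello", "World"], num_buckets = 10, key = [555, 0]
// The same key always maps a given string to the same bucket; a different key
// generally maps it to a different one.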
/**
*@brief Converts each string in the input Tensor to its hash mod by a number of buckets. \n
*@par Inputs:
include:
*string_tensor:The strings to assign a hash bucket. \n
*@par Attributes:
*num_buckets:The number of buckets. \n
*@par Outputs:
*y:A Tensor of the same shape as the input string_tensor. \n
*@see StringToHashBucket()
*@par Third-party framework compatibility
*compatible with StringToHashBucket op of tensorflow
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(StringToHashBucket)
    .INPUT(string_tensor, TensorType({DT_STRING}))
    .OUTPUT(y, TensorType({DT_INT64}))
    .ATTR(num_buckets, Int, 1)
    .OP_END_FACTORY_REG(StringToHashBucket)
/**
*@brief Strip leading and trailing whitespaces from the Tensor. \n
*@par Inputs:
include:
*x:A string Tensor of any shape. \n
*@par Outputs:
*y:A string Tensor of the same shape as the input. \n
*@see StringStrip()
*@par Third-party framework compatibility
*compatible with StringStrip op of tensorflow
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(StringStrip)
    .INPUT(x, TensorType({DT_STRING}))
    .OUTPUT(y, TensorType({DT_STRING}))
    .OP_END_FACTORY_REG(StringStrip)
/**
*@brief Computes the length of each string given in the input tensor. \n
*@par Inputs:
include:
*x:The string for which to compute the length. \n
*@par Attributes:
*unit:The unit that is counted to compute string length.
*One of: "BYTE" (for the number of bytes in each string) or
*"UTF8_CHAR" (for the number of UTF-8 encoded Unicode code points in each string).
*Results are undefined if unit=UTF8_CHAR and the input strings do not contain
*structurally valid UTF-8. \n
*@par Outputs:
*y:Integer tensor that has the same shape as input.
*The output contains the element-wise string lengths of input. \n
*@see StringLength()
*@par Third-party framework compatibility
*compatible with StringLength op of tensorflow
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(StringLength)
    .INPUT(x, TensorType({DT_STRING}))
    .OUTPUT(y, TensorType({DT_INT32}))
    .ATTR(unit, String, "BYTE")
    .OP_END_FACTORY_REG(StringLength)
/**
*@brief Joins the strings in the given list of string tensors into one tensor. \n
*@par Inputs:
include:
*x:A list of string tensors. The tensors must all have the same shape,
*or be scalars. Scalars may be mixed in; these will be broadcast to the shape
*of non-scalar inputs. It's a dynamic input. \n
*@par Attributes:
*@li N:The length of input x.
*@li separator:string, an optional join separator. \n
*@par Outputs:
*y:The output tensor. \n
*@see StringJoin()
*@par Third-party framework compatibility
*compatible with StringJoin op of tensorflow
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(StringJoin)
    .DYNAMIC_INPUT(x, TensorType({DT_STRING}))
    .OUTPUT(y, TensorType({DT_STRING}))
    .REQUIRED_ATTR(N, Int)
    .ATTR(separator, String, "")
    .OP_END_FACTORY_REG(StringJoin)
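// Worked example added for clarity (not part of the original header), assuming the
// TensorFlow StringJoin semantics referenced above (element-wise join across the N
// dynamic inputs):
//   x = { ["abc", "def"], ["1", "2"] }, N = 2, separator = "-"
//   => y = ["abc-1", "def-2"]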
/**
*@brief Formats a string template using a list of tensors. \n
*@par Inputs:
include:
*x:The tensors to format into the placeholder string. It's a dynamic input. \n
*@par Attributes:
*@li template:A string, the template to format tensor summaries into.
*@li placeholder:A string, at each placeholder in the template a subsequent tensor summary will be inserted.
*@li summarize:When formatting the tensor summaries print the first and last summarize entries of each tensor dimension. \n
*@par Outputs:
*y:The resulting string scalar. \n
*@see StringFormat()
*@par Third-party framework compatibility
* compatible with StringFormat op of tensorflow
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(StringFormat)
    .DYNAMIC_INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
        DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, DT_STRING, DT_FLOAT16, \
        DT_FLOAT, DT_DOUBLE, DT_BOOL}))
    .OUTPUT(y, TensorType({DT_STRING}))
    .ATTR(template, String, "%s")
    .ATTR(placeholder, String, "%s")
    .ATTR(summarize, Int, 3)
    .OP_END_FACTORY_REG(StringFormat)
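// Worked example added for clarity (not part of the original header), assuming the
// TensorFlow StringFormat semantics referenced above:
//   x = { [1, 2, 3] }, template = "x: %s, done", placeholder = "%s"
//   => y = "x: [1 2 3], done"    (a string scalar)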
/**
*@brief Check if the input matches the regex pattern. \n
*@par Inputs:
*The input is a string tensor of any shape. The pattern is a scalar string tensor
*which is applied to every element of the input tensor. The boolean values
*(True or False) of the output tensor indicate if the input matches the regex
*pattern provided. The pattern follows the re2 syntax
*(https://github.com/google/re2/wiki/Syntax).
include:
*@li x:A string tensor of the text to be processed.
*@li pattern:A scalar string tensor containing the regular expression to match the input. \n
*@par Outputs:
*y:A bool tensor with the same shape as input. \n
*@see RegexFullMatch()
*@par Third-party framework compatibility
*compatible with RegexFullMatch op of tensorflow
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(RegexFullMatch)
    .INPUT(x, TensorType({DT_STRING}))
    .INPUT(pattern, TensorType({DT_STRING}))
    .OUTPUT(y, TensorType({DT_BOOL}))
    .OP_END_FACTORY_REG(RegexFullMatch)
/**
*@brief Replaces matches of the pattern regular expression in input with the
*replacement string provided in rewrite. \n
*@par Inputs:
*It follows the re2 syntax (https://github.com/google/re2/wiki/Syntax).
include:
*@li x:The text to be processed.
*@li pattern:The regular expression to be matched in the input strings.
*@li rewrite:The rewrite string to be substituted for the pattern expression
*where it is matched in the input strings. \n
*@par Attributes:
*replace_global:If True, the replacement is global
*(that is, all matches of the pattern regular expression in each input string
*are rewritten), otherwise the rewrite substitution is only made for the first
* pattern match. \n
*@par Outputs:
*y:The text after applying pattern match and rewrite substitution. \n
*@see RegexReplace()
*@par Third-party framework compatibility
*compatible with RegexReplace op of tensorflow
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(RegexReplace)
    .INPUT(x, TensorType({DT_STRING}))
    .INPUT(pattern, TensorType({DT_STRING}))
    .INPUT(rewrite, TensorType({DT_STRING}))
    .OUTPUT(y, TensorType({DT_STRING}))
    .ATTR(replace_global, Bool, true)
    .OP_END_FACTORY_REG(RegexReplace)
/**
*@brief Converts each entry in the given tensor to strings. \n
*@par Inputs:
*Supports many numeric types and boolean.
include:
*x:A tensor to be converted to strings. \n
*@par Attributes:
*@li precision:The post-decimal precision to use for floating point numbers.
*Only used if precision > -1.
*@li scientific:Use scientific notation for floating point numbers.
*@li shortest:Use shortest representation (either scientific or standard)
*for floating point numbers.
*@li width:Pad pre-decimal numbers to this width. Applies to both floating
*point and integer numbers. Only used if width > -1.
*@li fill:The value to pad if width > -1. If empty, pads with spaces.
*Another typical value is '0'. String cannot be longer than 1 character. \n
*@par Outputs:
*y:The output tensor. \n
*@see AsString()
*@par Third-party framework compatibility
*compatible with AsString op of tensorflow
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(AsString)
    .INPUT(x, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_FLOAT, \
        DT_DOUBLE, DT_BOOL, DT_COMPLEX64, DT_COMPLEX128}))
    .OUTPUT(y, TensorType({DT_STRING}))
    .ATTR(precision, Int, -1)
    .ATTR(scientific, Bool, false)
    .ATTR(shortest, Bool, false)
    .ATTR(width, Int, -1)
    .ATTR(fill, String, "")
    .OP_END_FACTORY_REG(AsString)
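// Worked example added for clarity (not part of the original header), assuming the
// TensorFlow AsString semantics referenced above:
//   x = [3.14159, 2.71828], precision = 2, scientific = false
//   => y = ["3.14", "2.72"]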
/**
*@brief Encode strings into web-safe base64 format. \n
*@par Inputs:
*Web-safe means that the encoder uses - and _ instead of + and /. Padding with
*'=' at the end is controlled by the `pad` attribute.
include:
*x:Strings to be encoded. \n
*@par Attributes:
*pad:Bool whether padding is applied at the ends. \n
*@par Outputs:
*y:Input strings encoded in base64. \n
*@attention Constraints:
*Refer to the following article for more information on base64 format:
*en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '='
*at the end so that the encoded value has a length that is a multiple of 4.
*See the Padding section of the link above. Web-safe means that the encoder
*uses - and _ instead of + and /. \n
*@see EncodeBase64()
*@par Third-party framework compatibility
*compatible with EncodeBase64 op of tensorflow
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(EncodeBase64)
    .INPUT(x, TensorType({DT_STRING}))
    .OUTPUT(y, TensorType({DT_STRING}))
    .ATTR(pad, Bool, false)
    .OP_END_FACTORY_REG(EncodeBase64)
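// Worked example added for clarity (not part of the original header): web-safe
// encoding replaces '+' with '-' and '/' with '_' in the standard base64 alphabet:
//   x = ["hello?"], pad = false
//   => y = ["aGVsbG8_"]    (standard base64 would give "aGVsbG8/")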
/**
*@brief Decode web-safe base64-encoded strings. \n
*@par Inputs:
*Input may or may not have padding at the end. See EncodeBase64 for padding.
*Web-safe means that input must use - and _ instead of + and /.
include:
*x:Base64 strings to decode. \n
*@par Outputs:
*y:Decoded strings. \n
*@see DecodeBase64()
*@par Third-party framework compatibility
*compatible with DecodeBase64 op of tensorflow
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(DecodeBase64)
    .INPUT(x, TensorType({DT_STRING}))
    .OUTPUT(y, TensorType({DT_STRING}))
    .OP_END_FACTORY_REG(DecodeBase64)
/**
*@brief StringNormalization performs string operations for basic cleaning. \n
*@par Inputs:
*@li input: only accepts [C] or [1, C] UTF-8 strings tensor. \n
*@par Outputs:
*@li output: UTF-8 strings tensor after cleaning. \n
*@par Attributes:
*@li stopwords : list of strings (default is empty).
*List of stop words. If not set, no word will be removed from the input strings
tensor.
*@li is_case_sensitive : bool (default is false).
*Boolean. Whether the identification of stop words in the input strings tensor is
case-sensitive. Default is false.
*@li case_change_action : string (default is "NONE").
*String enum that specifies whether the output is lowercased/uppercased/unchanged. Valid
values are "LOWER", "UPPER", "NONE". Default is "NONE".
*@li local : string (default is "en_US").
*Environment-dependent string that denotes the locale according to which output
strings need to be upper/lowercased. Default is en_US or a platform-specific equivalent
as decided by the implementation. \n
*@attention Constraints:
*@li input can be either a 1-D or 2-D tensor; the shape of a 2-D tensor must be [1, C].
*/
REG_OP(StringNormalizer)
    .INPUT(input, TensorType({DT_STRING}))
    .OUTPUT(output, TensorType({DT_STRING}))
    .ATTR(stopwords, ListString, {})
    .ATTR(is_case_sensitive, Bool, false)
    .ATTR(case_change_action, String, "NONE")
    .ATTR(local, String, "en_US")
    .OP_END_FACTORY_REG(StringNormalizer)
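// Worked example added for clarity (not part of the original header), assuming the
// ONNX StringNormalizer behaviour that this description appears to follow (stop words
// are removed first, then the case change is applied):
//   input = ["the", "Cat", "sat"], stopwords = {"the"},
//   is_case_sensitive = false, case_change_action = "UPPER"
//   => output = ["CAT", "SAT"]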
} // namespace ge
#endif // OPS_BUILT_IN_OP_PROTO_INC_STRING_OPS_H_

The Graph Engine (GE) module is a submodule of MindSpore. It is implemented in C++ and sits between the front-end module ME and the underlying hardware, acting as the bridge between them. GE takes the graph delivered by ME as input, performs a series of deep graph-optimization passes, and outputs a graph that can run efficiently on the underlying hardware. GE applies optimizations tailored to the hardware architecture of the Ascend AI processor in order to fully exploit its compute power. During model training and inference, GE is invoked automatically and is transparent to the user. GE consists mainly of two parts: GE API and GE Core.
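The ops registered in string_ops.h become classes in the ge::op namespace and can be wired into a graph through the standard IR-building and GE APIs. The following is a minimal sketch added for illustration, not code from this repository; it assumes the usual headers ("all_ops.h", "ge/ge_api.h") and the setter methods generated by REG_OP (set_input_<name>, set_attr_<name>); exact header paths and session options vary between CANN versions.

#include <map>
#include <string>
#include <vector>
#include "all_ops.h"    // generated ge::op classes, including ge::op::StringSplit
#include "ge/ge_api.h"  // ge::GEInitialize, ge::Session

int main() {
    // Two graph inputs feeding the StringSplit op declared in string_ops.h.
    auto strings   = ge::op::Data("strings").set_attr_index(0);
    auto delimiter = ge::op::Data("delimiter").set_attr_index(1);
    auto split = ge::op::StringSplit("string_split")
                     .set_input_input(strings)
                     .set_input_delimiter(delimiter)
                     .set_attr_skip_empty(true);

    ge::Graph graph("string_split_graph");
    std::vector<ge::Operator> inputs{strings, delimiter};
    std::vector<ge::Operator> outputs{split};
    graph.SetInputs(inputs).SetOutputs(outputs);

    // Hand the graph to GE. When GE is driven by MindSpore, this step happens
    // automatically and is not visible to the user.
    std::map<std::string, std::string> options;
    ge::GEInitialize(options);
    ge::Session session(options);
    session.AddGraph(0, graph);
    // std::vector<ge::Tensor> in_tensors, out_tensors;
    // session.RunGraph(0, in_tensors, out_tensors);
    ge::GEFinalize();
    return 0;
}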