You can not select more than 25 topics Topics must start with a chinese character,a letter or number, can include dashes ('-') and can be up to 35 characters long.

string_ops.h 31 kB

5 years ago
4 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899
  1. /**
  2. * Copyright 2019 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. /*!
  17. * \file string_ops.h
  18. * \brief Operator prototype registrations for string and Unicode text operators.
  19. */
  20. #ifndef OPS_BUILT_IN_OP_PROTO_INC_STRING_OPS_H_
  21. #define OPS_BUILT_IN_OP_PROTO_INC_STRING_OPS_H_
  22. #include <sstream>
  23. #include "graph/operator_reg.h"
  24. namespace ge {
  25. /**
  26. *@brief Creates ngrams from ragged string data. \n
  27. *@par Inputs:
  28. *Two inputs, including:
  29. *@li data: A 1-D tensor of type string. The values tensor of the ragged string tensor to make ngrams out of.
  30. *@li data_splits: A tensor of type int32 or int64. The splits tensor of the ragged string tensor to make ngrams out of. \n
  31. *@par Attributes:
  32. *@li separator: A required string. The string to append between elements of the token. Use "" for no separator.
  33. *@li ngram_widths: An optional list of ints. Defaults to an empty list. The sizes of the ngrams to create.
  34. *@li left_pad: A required string. The string to use to pad the left side of the ngram sequence. Only used if pad_width != 0.
  35. *@li right_pad: A required string. The string to use to pad the right side of the ngram sequence. Only used if pad_width != 0.
  36. *@li pad_width: A required int. The number of padding elements to add to each side of each sequence.
  37. *@li preserve_short_sequences: A required bool. Whether to preserve short sequences. \n
  38. *@par Outputs:
  39. *@li ngrams: A tensor of type string. The values tensor of the output ngrams ragged tensor.
  40. *@li ngrams_splits: A tensor of type int32 or int64. The splits tensor of the output ngrams ragged tensor. \n
  41. *@see StringNGrams()
  42. *@par Third-party framework compatibility
  43. *compatible with StringNGrams op of tensorflow
  44. *@par Restrictions:
  45. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  46. */
  47. REG_OP(StringNGrams)
  48. .INPUT(data, TensorType({DT_STRING}))
  49. .INPUT(data_splits, TensorType({DT_INT32, DT_INT64}))
  50. .OUTPUT(ngrams, TensorType({DT_STRING}))
  51. .OUTPUT(ngrams_splits, TensorType({DT_INT32, DT_INT64}))
  52. .REQUIRED_ATTR(separator, String)
  53. .ATTR(ngram_widths, ListInt, {})
  54. .REQUIRED_ATTR(left_pad, String)
  55. .REQUIRED_ATTR(right_pad, String)
  56. .REQUIRED_ATTR(pad_width, Int)
  57. .REQUIRED_ATTR(preserve_short_sequences, Bool)
  58. .OP_END_FACTORY_REG(StringNGrams)
  59. /**
  60. *@brief Decodes each string in `input` into a sequence of Unicode code points and reports the byte offset at which each character starts. \n
  61. *@par Inputs:
  62. *One input:
  63. *@li input: A tensor of type string. The text to be decoded. Can have any shape. Note that the output is flattened
  64. *to a vector of char values. \n
  65. *@par Attributes:
  66. *@li input_encoding: A required string. Text encoding of the input strings. This is any of the encodings supported
  67. *by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
  68. *@li errors: An optional string. Defaults to "replace". Error handling policy when there is invalid formatting found in the input.
  69. *The value of 'strict' will cause the operation to produce an InvalidArgument
  70. *error on any invalid input formatting. A value of 'replace' (the default) will
  71. *cause the operation to replace any invalid formatting in the input with the
  72. *`replacement_char` codepoint. A value of 'ignore' will cause the operation to
  73. *skip any invalid formatting in the input and produce no corresponding output
  74. *character.
  75. *@li replacement_char: An optional int. The replacement character codepoint to be used in place of any invalid
  76. *formatting in the input when `errors='replace'`. Any valid unicode codepoint may
  77. *be used. Defaults to 65533 (0xFFFD), the code point of the Unicode replacement
  78. *character U+FFFD.
  79. *@li replace_control_characters: An optional bool. Whether to replace the C0 control characters (00-1F) with the
  80. *`replacement_char`. Defaults to false. \n
  81. *@par Outputs:
  82. *@li row_splits: A 1D tensor of type int64 containing the row splits.
  83. *@li char_values: A 1D tensor of type int32 containing the decoded codepoints.
  84. *@li char_to_byte_starts: A 1D tensor of type int64 containing the byte index in the input string where each
  85. *character in `char_values` starts. \n
  86. *@see UnicodeDecodeWithOffsets()
  87. *@par Third-party framework compatibility
  88. *compatible with UnicodeDecodeWithOffsets op of tensorflow
  89. *@par Restrictions:
  90. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  91. */
  92. REG_OP(UnicodeDecodeWithOffsets)
  93. .INPUT(input, TensorType({DT_STRING}))
  94. .OUTPUT(row_splits, TensorType({DT_INT64}))
  95. .OUTPUT(char_values, TensorType({DT_INT32}))
  96. .OUTPUT(char_to_byte_starts, TensorType({DT_INT64}))
  97. .REQUIRED_ATTR(input_encoding, String)
  98. .ATTR(errors, String, "replace")
  99. .ATTR(replacement_char, Int, 65533)
  100. .ATTR(replace_control_characters, Bool, false)
  101. .OP_END_FACTORY_REG(UnicodeDecodeWithOffsets)
  102. /**
  103. *@brief Decodes each string in `input` into a sequence of Unicode code points. \n
  104. *@par Inputs:
  105. *One input:
  106. *@li input: A tensor of type string. The text to be decoded. Can have any shape. Note that the output is flattened
  107. *to a vector of char values. \n
  108. *@par Attributes:
  109. *@li input_encoding: A required string. Text encoding of the input strings. This is any of the encodings supported
  110. *by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
  111. *@li errors: An optional string. Defaults to "replace". Error handling policy when there is invalid formatting found in the input.
  112. *The value of 'strict' will cause the operation to produce an InvalidArgument
  113. *error on any invalid input formatting. A value of 'replace' (the default) will
  114. *cause the operation to replace any invalid formatting in the input with the
  115. *`replacement_char` codepoint. A value of 'ignore' will cause the operation to
  116. *skip any invalid formatting in the input and produce no corresponding output
  117. *character.
  118. *@li replacement_char: An optional int. The replacement character codepoint to be used in place of any invalid
  119. *formatting in the input when `errors='replace'`. Any valid unicode codepoint may
  120. *be used. Defaults to 65533 (0xFFFD), the code point of the Unicode replacement
  121. *character U+FFFD.
  122. *@li replace_control_characters: An optional bool. Whether to replace the C0 control characters (00-1F) with the
  123. *`replacement_char`. Defaults to false. \n
  124. *@par Outputs:
  125. *@li row_splits: A 1D tensor of type int64 containing the row splits.
  126. *@li char_values: A 1D tensor of type int32 containing the decoded codepoints. \n
  127. *@see UnicodeDecode()
  128. *@par Third-party framework compatibility
  129. *compatible with UnicodeDecode op of tensorflow
  130. *@par Restrictions:
  131. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  132. */
  133. REG_OP(UnicodeDecode)
  134. .INPUT(input, TensorType({DT_STRING}))
  135. .OUTPUT(row_splits, TensorType({DT_INT64}))
  136. .OUTPUT(char_values, TensorType({DT_INT32}))
  137. .REQUIRED_ATTR(input_encoding, String)
  138. .ATTR(errors, String, "replace")
  139. .ATTR(replacement_char, Int, 65533)
  140. .ATTR(replace_control_characters, Bool, false)
  141. .OP_END_FACTORY_REG(UnicodeDecode)
  142. /**
  143. *@brief Transcode the input text from a source encoding to a destination encoding. \n
  144. *@par Inputs:
  145. *One input:
  146. *@li input: A tensor of type string. The text to be processed. Can have any shape. \n
  147. *@par Attributes:
  148. *@li input_encoding: A required string. Text encoding of the input strings. This is any of the encodings supported
  149. *by ICU ucnv algorithmic converters. Examples: `"UTF-16", "US ASCII", "UTF-8"`.
  150. *@li output_encoding: An optional string. Defaults to "UTF-8". The unicode encoding to use in the output. Must be one of `"UTF-8", "UTF-16-BE", "UTF-32-BE"`.
  151. *Multi-byte encodings will be big-endian.
  152. *@li errors: An optional string. Defaults to "replace". Error handling policy when there is invalid formatting found in the input.
  153. *The value of 'strict' will cause the operation to produce an InvalidArgument
  154. *error on any invalid input formatting. A value of 'replace' (the default) will
  155. *cause the operation to replace any invalid formatting in the input with the
  156. *`replacement_char` codepoint. A value of 'ignore' will cause the operation to
  157. *skip any invalid formatting in the input and produce no corresponding output
  158. *character.
  159. *@li replacement_char: An optional int. The replacement character codepoint to be used in place of any invalid
  160. *formatting in the input when `errors='replace'`. Any valid unicode codepoint may
  161. *be used. Defaults to 65533 (0xFFFD), the code point of the Unicode replacement
  162. *character U+FFFD.
  163. *@li replace_control_characters: An optional bool. Whether to replace the C0 control characters (00-1F) with the
  164. *`replacement_char`. Defaults to false. \n
  165. *@par Outputs:
  166. *@li output: A tensor of type string containing unicode text encoded using `output_encoding`. \n
  167. *@see UnicodeTranscode()
  168. *@par Third-party framework compatibility
  169. *compatible with UnicodeTranscode op of tensorflow
  170. *@par Restrictions:
  171. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  172. */
  173. REG_OP(UnicodeTranscode)
  174. .INPUT(input, TensorType({DT_STRING}))
  175. .OUTPUT(output, TensorType({DT_STRING}))
  176. .REQUIRED_ATTR(input_encoding, String)
  177. .ATTR(output_encoding, String, "UTF-8")
  178. .ATTR(errors, String, "replace")
  179. .ATTR(replacement_char, Int, 65533)
  180. .ATTR(replace_control_characters, Bool, false)
  181. .OP_END_FACTORY_REG(UnicodeTranscode)
  182. /**
  183. *@brief Encode a tensor of ints into unicode strings. \n
  184. *@par Inputs:
  185. *Two inputs, including:
  186. *@li input_values: A 1D tensor of type int32 containing the unicode codepoints that should be encoded.
  187. *@li input_splits: A 1D tensor of type int32 or int64 specifying how the unicode codepoints should be split into strings. \n
  188. *@par Attributes:
  189. *@li output_encoding: An optional string. Defaults to "UTF-8". The unicode encoding to use in the output. Must be one of `"UTF-8", "UTF-16-BE", "UTF-32-BE"`.
  190. *Multi-byte encodings will be big-endian.
  191. *@li errors: An optional string. Defaults to "replace". Error handling policy when there is invalid formatting found in the input.
  192. *The value of 'strict' will cause the operation to produce an InvalidArgument
  193. *error on any invalid input formatting. A value of 'replace' (the default) will
  194. *cause the operation to replace any invalid formatting in the input with the
  195. *`replacement_char` codepoint. A value of 'ignore' will cause the operation to
  196. *skip any invalid formatting in the input and produce no corresponding output
  197. *character.
  198. *@li replacement_char: An optional int. The replacement character codepoint to be used in place of any invalid
  199. *formatting in the input when `errors='replace'`. Any valid unicode codepoint may
  200. *be used. Defaults to 65533 (0xFFFD), the code point of the Unicode replacement
  201. *character U+FFFD. \n
  202. *@par Outputs:
  203. *@li output: A 1-D tensor of type string, encoded from the provided unicode codepoints. \n
  204. *@see UnicodeEncode()
  205. *@par Third-party framework compatibility
  206. *compatible with UnicodeEncode op of tensorflow
  207. *@par Restrictions:
  208. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  209. */
  210. REG_OP(UnicodeEncode)
  211. .INPUT(input_values, TensorType({DT_INT32}))
  212. .INPUT(input_splits, TensorType({DT_INT32, DT_INT64}))
  213. .OUTPUT(output, TensorType({DT_STRING}))
  214. .ATTR(errors, String, "replace")
  215. .ATTR(output_encoding, String, "UTF-8")
  216. .ATTR(replacement_char, Int, 65533)
  217. .OP_END_FACTORY_REG(UnicodeEncode)
  218. /**
  219. *@brief Split elements of input based on delimiter into a SparseTensor. \n
  220. *@par Inputs:
  221. *Two inputs, including:
  222. *@li input: A 1-D tensor of type string. Strings to split.
  223. *@li delimiter: A 0-D tensor of type string. Delimiter characters (bytes), or empty string. \n
  224. *@par Attributes:
  225. *skip_empty: An optional bool. Defaults to true. If true, skip the empty strings from the result. \n
  226. *@par Outputs:
  227. *@li indices: A dense matrix of int64 representing the indices of the sparse tensor.
  228. *@li values: A vector of strings corresponding to the split values.
  229. *@li shape: A length-2 vector of int64 representing the shape of the sparse tensor,
  230. *where the first value is N and the second value is the maximum number of tokens
  231. *in a single input entry. \n
  232. *@see StringSplit()
  233. *@par Third-party framework compatibility
  234. *compatible with StringSplit op of tensorflow
  235. *@par Restrictions:
  236. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  237. */
  238. REG_OP(StringSplit)
  239. .INPUT(input, TensorType({DT_STRING}))
  240. .INPUT(delimiter, TensorType({DT_STRING}))
  241. .OUTPUT(indices, TensorType({DT_INT64}))
  242. .OUTPUT(values, TensorType({DT_STRING}))
  243. .OUTPUT(shape, TensorType({DT_INT64}))
  244. .ATTR(skip_empty, Bool, true)
  245. .OP_END_FACTORY_REG(StringSplit)
  246. /**
  247. *@brief Replaces the match of pattern in input with rewrite. \n
  248. *@par Inputs:
  249. *One input:
  250. *@li input: A tensor of type string. The text to be processed. \n
  251. *@par Attributes:
  252. *@li pattern: An optional string. Defaults to "". The regular expression to match the input.
  253. *@li rewrite: An optional string. Defaults to "". The rewrite to be applied to the matched expression.
  254. *@li replace_global: An optional bool. Defaults to true. If true, the replacement is global,
  255. *otherwise the replacement is done only on the first match. \n
  256. *@par Outputs:
  257. *@li output: A tensor of type string.
  258. */
  259. REG_OP(StaticRegexReplace)
  260. .INPUT(input, TensorType({DT_STRING}))
  261. .OUTPUT(output, TensorType({DT_STRING}))
  262. .ATTR(pattern, String, "")
  263. .ATTR(rewrite, String, "")
  264. .ATTR(replace_global, Bool, true)
  265. .OP_END_FACTORY_REG(StaticRegexReplace)
  266. /**
  267. *@brief Checks whether each element of a string tensor matches a regular
  268. *expression. The input is a string tensor of any shape, and the pattern
  269. *attribute is matched against every element of it. The boolean values
  270. *(true or false) of the output tensor indicate if the input matches the pattern. \n
  271. *@par Inputs:
  272. *One input:
  273. *@li input: A tensor of type string. The text to be processed. \n
  274. *@par Attributes:
  275. *@li pattern: An optional string. Defaults to "". The regular expression to match the input. \n
  276. *@par Outputs:
  277. *@li output: A tensor of type bool with the same shape as `input`.
  278. */
  279. REG_OP(StaticRegexFullMatch)
  280. .INPUT(input, TensorType({DT_STRING}))
  281. .OUTPUT(output, TensorType({DT_BOOL}))
  282. .ATTR(pattern, String, "")
  283. .OP_END_FACTORY_REG(StaticRegexFullMatch)
  284. /**
  285. *@brief Joins the strings of input using separator. NOTE(review): segment_ids and num_segments presumably select the output segment of each element - confirm against the kernel. \n
  286. *@par Inputs:
  287. *Three inputs, including:
  288. *@li input: A tensor of type string. The strings to be joined.
  289. *@li segment_ids: A tensor. Must be one of the following types: int32, int64.
  290. *A tensor whose shape is a prefix of input.shape. Negative segment ids are not supported.
  291. *@li num_segments: A tensor. Must be one of the following types: int32, int64. A scalar. \n
  292. *@par Attributes:
  293. *@li separator: An optional string. Defaults to "". The separator to use when joining. \n
  294. *@par Outputs:
  295. *@li output: A tensor of type string.
  296. */
  297. REG_OP(UnsortedSegmentJoin)
  298. .INPUT(input, TensorType({DT_STRING}))
  299. .INPUT(segment_ids, TensorType({DT_INT32,DT_INT64}))
  300. .INPUT(num_segments, TensorType({DT_INT32,DT_INT64}))
  301. .OUTPUT(output, TensorType({DT_STRING}))
  302. .ATTR(separator, String, "")
  303. .OP_END_FACTORY_REG(UnsortedSegmentJoin)
  304. /**
  305. *@brief Presumably converts the strings of the input tensor to lower case.
  306. *NOTE(review): inferred from the operator name only; the kernel is not visible in this header - confirm. \n
  307. *@par Inputs:
  308. *One input:
  309. *@li input: A tensor of type string. The text to be processed. \n
  310. *@par Attributes:
  311. *@li encoding: An optional string. Defaults to "". \n
  312. *@par Outputs:
  313. *@li output: A tensor of type string.
  314. */
  315. REG_OP(StringLower)
  316. .INPUT(input, TensorType({DT_STRING}))
  317. .OUTPUT(output, TensorType({DT_STRING}))
  318. .ATTR(encoding, String, "")
  319. .OP_END_FACTORY_REG(StringLower)
  320. /**
  321. *@brief Presumably converts the strings of the input tensor to upper case.
  322. *NOTE(review): inferred from the operator name only; the kernel is not visible in this header - confirm. \n
  323. *@par Inputs:
  324. *One input:
  325. *@li input: A tensor of type string. The text to be processed. \n
  326. *@par Attributes:
  327. *@li encoding: An optional string. Defaults to "". \n
  328. *@par Outputs:
  329. *@li output: A tensor of type string.
  330. */
  331. REG_OP(StringUpper)
  332. .INPUT(input, TensorType({DT_STRING}))
  333. .OUTPUT(output, TensorType({DT_STRING}))
  334. .ATTR(encoding, String, "")
  335. .OP_END_FACTORY_REG(StringUpper)
  336. /**
  337. *@brief Split elements of input based on sep into a SparseTensor. \n
  338. *@par Inputs:
  339. *Two inputs, including:
  340. *@li input: A 1-D tensor of type string. Strings to split.
  341. *@li sep: A 0-D tensor of type string, the delimiter character. \n
  342. *@par Attributes:
  343. *maxsplit: An optional int. Defaults to -1. If maxsplit > 0, limit of the split of the result. \n
  344. *@par Outputs:
  345. *@li indices: A dense matrix of int64 representing the indices of the sparse tensor.
  346. *@li values: A vector of strings corresponding to the split values.
  347. *@li shape: A length-2 vector of int64 representing the shape of the sparse tensor,
  348. *where the first value is N and the second value is the maximum number of tokens
  349. *in a single input entry. \n
  350. *@see StringSplitV2()
  351. *@par Third-party framework compatibility
  352. *compatible with StringSplitV2 op of tensorflow
  353. *@par Restrictions:
  354. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  355. */
  356. REG_OP(StringSplitV2)
  357. .INPUT(input, TensorType({DT_STRING}))
  358. .INPUT(sep, TensorType({DT_STRING}))
  359. .OUTPUT(indices, TensorType({DT_INT64}))
  360. .OUTPUT(values, TensorType({DT_STRING}))
  361. .OUTPUT(shape, TensorType({DT_INT64}))
  362. .ATTR(maxsplit, Int, -1)
  363. .OP_END_FACTORY_REG(StringSplitV2)
  364. /**
  365. *@brief Determine the script codes of a given tensor of Unicode integer code points. \n
  366. *@par Inputs:
  367. *One input:
  368. *x: A tensor of type int32 holding Unicode code points. \n
  369. *@par Outputs:
  370. *y: A tensor of type int32 holding the script code corresponding to each input code point. \n
  371. *@attention Constraints:
  372. *This operation converts Unicode code points to script codes corresponding to
  373. *each code point. Script codes correspond to International Components for
  374. *Unicode (ICU) UScriptCode values.
  375. *See http://icu-project.org/apiref/icu4c/uscript_8h.html.
  376. *Returns -1 (USCRIPT_INVALID_CODE) for invalid codepoints.
  377. *Output shape will match input shape. \n
  378. *@see UnicodeScript()
  379. *@par Third-party framework compatibility
  380. *compatible with UnicodeScript op of tensorflow
  381. *@par Restrictions:
  382. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  383. */
  384. REG_OP(UnicodeScript)
  385. .INPUT(x, TensorType({DT_INT32}))
  386. .OUTPUT(y, TensorType({DT_INT32}))
  387. .OP_END_FACTORY_REG(UnicodeScript)
  388. /**
  389. *@brief Return substrings from Tensor of strings. \n
  390. *@par Inputs:
  391. *Three inputs, including:
  392. *@li input: A tensor of type string. Tensor of strings.
  393. *@li pos: A tensor of type int32 or int64. Scalar defining the position of first character in each substring.
  394. *@li len: A tensor of type int32 or int64. Scalar defining the number of characters to include in each substring. \n
  395. *@par Outputs:
  396. *output: A tensor of type string. Tensor of substrings. \n
  397. *@attention Constraints:
  398. *NOTE(review): no Substr-specific constraints are documented. The text
  399. *formerly placed here described the hash function of StringToHashBucketFast
  400. *and does not apply to a substring operation. The registration below only
  401. *shows that input and output are string tensors and that pos and len are
  402. *int32 or int64 tensors. Whether pos and len must share a type, and how
  403. *out-of-range positions or lengths are handled, cannot be told from this
  404. *header; confirm against the kernel implementation. \n
  405. *@see Substr()
  406. *@par Third-party framework compatibility
  407. *compatible with Substr op of tensorflow
  408. *@par Restrictions:
  409. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  410. */
  411. REG_OP(Substr)
  412. .INPUT(input, TensorType({DT_STRING}))
  413. .INPUT(pos, TensorType({DT_INT32, DT_INT64}))
  414. .INPUT(len, TensorType({DT_INT32, DT_INT64}))
  415. .OUTPUT(output, TensorType({DT_STRING}))
  416. .OP_END_FACTORY_REG(Substr)
  417. /**
  418. *@brief Converts each string in the input Tensor to its hash mod by a number of buckets. \n
  419. *@par Inputs:
  420. *x: A tensor of type string. The strings to assign a hash bucket. \n
  421. *@par Attributes:
  422. *num_buckets: An optional int. Defaults to 1. The number of buckets. \n
  423. *@par Outputs:
  424. *y: A tensor of type int64 with the same shape as the input x. \n
  425. *@attention Constraints:
  426. *The hash function is deterministic on the content of the string within the process
  427. *and will never change. However, it is not suitable for cryptography. This function
  428. *may be used when CPU time is scarce and inputs are trusted or unimportant. There is
  429. *a risk of adversaries constructing inputs that all hash to the same bucket.
  430. *To prevent this problem, use a strong hash function with tf.string_to_hash_bucket_strong. \n
  431. *@see StringToHashBucketFast()
  432. *@par Third-party framework compatibility
  433. *compatible with StringToHashBucketFast op of tensorflow
  434. *@par Restrictions:
  435. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  436. */
  437. REG_OP(StringToHashBucketFast)
  438. .INPUT(x, TensorType({DT_STRING}))
  439. .OUTPUT(y, TensorType({DT_INT64}))
  440. .ATTR(num_buckets, Int, 1)
  441. .OP_END_FACTORY_REG(StringToHashBucketFast)
  442. /**
  443. *@brief Converts each string in the input Tensor to its hash mod by a number of buckets. \n
  444. *@par Inputs:
  445. *x: A tensor of type string. The strings to assign a hash bucket. \n
  446. *@par Attributes:
  447. *@li num_buckets: An optional int. Defaults to 1. The number of buckets.
  448. *@li key: A required list of ints. Presumably the key that seeds the hash function (see Constraints) - confirm. \n
  449. *@par Outputs:
  450. *y: A tensor of type int64 with the same shape as the input x. \n
  451. *@attention Constraints:
  452. *@li A strong hash is important when inputs may be malicious, e.g. URLs with
  453. *additional components. Adversaries could try to make their inputs hash to
  454. *the same bucket for a denial-of-service attack or to skew the results.
  455. *A strong hash can be used to make it difficult to find inputs with a skewed
  456. *hash value distribution over buckets. This requires that the hash function
  457. *is seeded by a high-entropy (random) "key" unknown to the adversary.
  458. *@li The additional robustness comes at a cost of roughly 4x higher
  459. *compute time than tf.string_to_hash_bucket_fast. \n
  460. *@see StringToHashBucketStrong()
  461. *@par Third-party framework compatibility
  462. *compatible with StringToHashBucketStrong op of tensorflow
  463. *@par Restrictions:
  464. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  465. */
  466. REG_OP(StringToHashBucketStrong)
  467. .INPUT(x, TensorType({DT_STRING}))
  468. .OUTPUT(y, TensorType({DT_INT64}))
  469. .ATTR(num_buckets, Int, 1)
  470. .REQUIRED_ATTR(key, ListInt)
  471. .OP_END_FACTORY_REG(StringToHashBucketStrong)
  472. /**
  473. *@brief Converts each string in the input Tensor to its hash mod by a number of buckets. \n
  474. *@par Inputs:
  475. *One input:
  476. *string_tensor: A tensor of type string. The strings to assign a hash bucket. \n
  477. *@par Attributes:
  478. *num_buckets: An optional int. Defaults to 1. The number of buckets. \n
  479. *@par Outputs:
  480. *y: A tensor of type int64 with the same shape as the input string_tensor. \n
  481. *@see StringToHashBucket()
  482. *@par Third-party framework compatibility
  483. *compatible with StringToHashBucket op of tensorflow
  484. *@par Restrictions:
  485. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  486. */
  487. REG_OP(StringToHashBucket)
  488. .INPUT(string_tensor, TensorType({DT_STRING}))
  489. .OUTPUT(y, TensorType({DT_INT64}))
  490. .ATTR(num_buckets, Int, 1)
  491. .OP_END_FACTORY_REG(StringToHashBucket)
  492. /**
  493. *@brief Strip leading and trailing whitespaces from the Tensor. \n
  494. *@par Inputs:
  495. *One input:
  496. *x: A tensor of type string, of any shape. \n
  497. *@par Outputs:
  498. *y: A tensor of type string with the same shape as the input. \n
  499. *@see StringStrip()
  500. *@par Third-party framework compatibility
  501. *compatible with StringStrip op of tensorflow
  502. *@par Restrictions:
  503. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  504. */
  505. REG_OP(StringStrip)
  506. .INPUT(x, TensorType({DT_STRING}))
  507. .OUTPUT(y, TensorType({DT_STRING}))
  508. .OP_END_FACTORY_REG(StringStrip)
  509. /**
  510. *@brief Computes the length of each string given in the input tensor. \n
  511. *@par Inputs:
  512. *One input:
  513. *x: A tensor of type string. The strings for which to compute the length. \n
  514. *@par Attributes:
  515. *unit: An optional string. Defaults to "BYTE". The unit that is counted to compute string length.
  516. *One of: "BYTE" (for the number of bytes in each string) or
  517. *"UTF8_CHAR" (for the number of UTF-8 encoded Unicode code points in each string).
  518. *Results are undefined if unit=UTF8_CHAR and the input strings do not contain
  519. *structurally valid UTF-8. \n
  520. *@par Outputs:
  521. *y: A tensor of type int32 that has the same shape as input.
  522. *The output contains the element-wise string lengths of input. \n
  523. *@see StringLength()
  524. *@par Third-party framework compatibility
  525. *compatible with StringLength op of tensorflow
  526. *@par Restrictions:
  527. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  528. */
  529. REG_OP(StringLength)
  530. .INPUT(x, TensorType({DT_STRING}))
  531. .OUTPUT(y, TensorType({DT_INT32}))
  532. .ATTR(unit, String, "BYTE")
  533. .OP_END_FACTORY_REG(StringLength)
  534. /**
  535. *@brief Joins the strings in the given list of string tensors into one tensor. \n
  536. *@par Inputs:
  543. *x: A list of tensors of type string. The tensors must all have the same shape,
  544. *or be scalars. Scalars may be mixed in; these will be broadcast to the shape
  545. *of non-scalar inputs. It's a dynamic input. \n
  546. *@par Attributes:
  547. *@li N: A required int. The length of input x.
  548. *@li separator: An optional string. Defaults to "". The join separator. \n
  549. *@par Outputs:
  550. *y: A tensor of type string. The output tensor. \n
  551. *@see StringJoin()
  552. *@par Third-party framework compatibility
  553. *compatible with StringJoin op of tensorflow
  554. *@par Restrictions:
  555. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  556. */
  557. REG_OP(StringJoin)
  558. .DYNAMIC_INPUT(x, TensorType({DT_STRING}))
  559. .OUTPUT(y, TensorType({DT_STRING}))
  560. .REQUIRED_ATTR(N, Int)
  561. .ATTR(separator, String, "")
  562. .OP_END_FACTORY_REG(StringJoin)
  563. /**
  564. *@brief Formats a string template using a list of tensors. \n
  565. *@par Inputs:
  572. *x: The tensors to format into the placeholder string. It's a dynamic input. Must be one of the following types:
  573. *int8, uint8, int16, uint16, int32, int64, uint32, uint64, string, float16, float, double, bool. \n
  574. *@par Attributes:
  575. *@li template: An optional string. Defaults to "%s". The template to format tensor summaries into.
  576. *@li placeholder: An optional string. Defaults to "%s". At each placeholder in the template a subsequent tensor summary will be inserted.
  577. *@li summarize: An optional int. Defaults to 3. When formatting the tensor summaries print the first and last summarize entries of each tensor dimension. \n
  577. *@par Outputs:
  578. *y: A tensor of type string. The resulting string scalar. \n
  579. *@see StringFormat()
  580. *@par Third-party framework compatibility
  581. *compatible with StringFormat op of tensorflow
  582. *@par Restrictions:
  583. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  584. */
  585. REG_OP(StringFormat)
  586. .DYNAMIC_INPUT(x, TensorType({DT_INT8, DT_UINT8, DT_INT16, DT_UINT16, \
  587. DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, DT_STRING, DT_FLOAT16, \
  588. DT_FLOAT, DT_DOUBLE, DT_BOOL}))
  589. .OUTPUT(y, TensorType({DT_STRING}))
  590. .ATTR(template, String, "%s")
  591. .ATTR(placeholder, String, "%s")
  592. .ATTR(summarize, Int, 3)
  593. .OP_END_FACTORY_REG(StringFormat)
  594. /**
  595. *@brief Checks if the input matches the regex pattern . \n
  596. *@par Inputs:
  597. *The input is a string tensor of any shape. The pattern is a scalar string tensor
  598. *which is applied to every element of the input tensor. The boolean values
  599. *(True or False) of the output tensor indicate if the input matches the regex
  600. *pattern provided. The pattern follows the re2 syntax
  601. *(https://github.com/google/re2/wiki/Syntax).
  602. *Inputs include:
  603. *@li x:A string tensor of the text to be processed.
  604. *@li pattern:A scalar string tensor containing the regular expression to match the input . \n
  605. *@par Outputs:
  606. *y:A bool tensor with the same shape as input . \n
  607. *@see RegexFullMatch()
  608. *@par Third-party framework compatibility
  609. *compatible with RegexFullMatch op of tensorflow
  610. *@par Restrictions:
  611. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  612. */
  613. REG_OP(RegexFullMatch)
  614. .INPUT(x, TensorType({DT_STRING}))
  615. .INPUT(pattern, TensorType({DT_STRING}))
  616. .OUTPUT(y, TensorType({DT_BOOL}))
  617. .OP_END_FACTORY_REG(RegexFullMatch)
  618. /**
  619. *@brief Replaces matches of the pattern regular expression in input with the
  620. *replacement string provided in rewrite . \n
  621. *@par Inputs:
  622. *The pattern follows the re2 syntax (https://github.com/google/re2/wiki/Syntax).
  623. *Inputs include:
  624. *@li x:The text to be processed.
  625. *@li pattern:The regular expression to be matched in the input strings.
  626. *@li rewrite:The rewrite string to be substituted for the pattern expression
  627. *where it is matched in the input strings . \n
  628. *@par Attributes:
  629. *replace_global:If True, the replacement is global
  630. *(that is, all matches of the pattern regular expression in each input string
  631. *are rewritten), otherwise the rewrite substitution is only made for the first
  632. *pattern match. Defaults to true . \n
  633. *@par Outputs:
  634. *y:The text after applying pattern match and rewrite substitution . \n
  635. *@see RegexReplace()
  636. *@par Third-party framework compatibility
  637. *compatible with RegexReplace op of tensorflow
  638. *@par Restrictions:
  639. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  640. */
  641. REG_OP(RegexReplace)
  642. .INPUT(x, TensorType({DT_STRING}))
  643. .INPUT(pattern, TensorType({DT_STRING}))
  644. .INPUT(rewrite, TensorType({DT_STRING}))
  645. .OUTPUT(y, TensorType({DT_STRING}))
  646. .ATTR(replace_global, Bool, true)
  647. .OP_END_FACTORY_REG(RegexReplace)
  648. /**
  649. *@brief Converts each entry in the given tensor to strings . \n
  650. *@par Inputs:
  651. *Supported types: int8, int16, int32, int64, float, double, bool, complex64, complex128.
  652. *Inputs include:
  653. *x:The tensor whose entries are converted to strings . \n
  654. *@par Attributes:
  655. *@li precision:The post-decimal precision to use for floating point numbers.
  656. *Only used if precision > -1. Defaults to -1.
  657. *@li scientific:Use scientific notation for floating point numbers. Defaults to false.
  658. *@li shortest:Use shortest representation (either scientific or standard)
  659. *for floating point numbers. Defaults to false.
  660. *@li width:Pad pre-decimal numbers to this width. Applies to both floating
  661. *point and integer numbers. Only used if width > -1. Defaults to -1.
  662. *@li fill:The value to pad if width > -1. If empty, pads with spaces.
  663. *Another typical value is '0'. String cannot be longer than 1 character. Defaults to "" . \n
  664. *@par Outputs:
  665. *y:The output tensor . \n
  666. *@see AsString()
  667. *@par Third-party framework compatibility
  668. *compatible with AsString op of tensorflow
  669. *@par Restrictions:
  670. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  671. */
  672. REG_OP(AsString)
  673. .INPUT(x, TensorType({DT_INT8, DT_INT16, DT_INT32, DT_INT64, DT_FLOAT, \
  674. DT_DOUBLE, DT_BOOL, DT_COMPLEX64, DT_COMPLEX128}))
  675. .OUTPUT(y, TensorType({DT_STRING}))
  676. .ATTR(precision, Int, -1)
  677. .ATTR(scientific, Bool, false)
  678. .ATTR(shortest, Bool, false)
  679. .ATTR(width, Int, -1)
  680. .ATTR(fill, String, "")
  681. .OP_END_FACTORY_REG(AsString)
  682. /**
  683. *@brief Encode strings into web-safe base64 format . \n
  684. *@par Inputs:
  685. *The op takes a single input of type string and produces a string output.
  686. *Web-safe means that the encoder uses - and _ instead of + and /.
  687. *Inputs include:
  688. *x:Strings to be encoded . \n
  689. *@par Attributes:
  690. *pad:Bool whether padding is applied at the ends. Defaults to false . \n
  691. *@par Outputs:
  692. *y:Input strings encoded in base64 . \n
  693. *@attention Constraints:
  694. *Refer to the following article for more information on base64 format:
  695. *en.wikipedia.org/wiki/Base64. Base64 strings may have padding with '='
  696. *at the end so that the encoded string has a length that is a multiple of 4.
  697. *See Padding section of the link above. Web-safe means that the encoder
  698. *uses - and _ instead of + and / . \n
  699. *@see EncodeBase64()
  700. *@par Third-party framework compatibility
  701. *compatible with EncodeBase64 op of tensorflow
  702. *@par Restrictions:
  703. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  704. */
  705. REG_OP(EncodeBase64)
  706. .INPUT(x, TensorType({DT_STRING}))
  707. .OUTPUT(y, TensorType({DT_STRING}))
  708. .ATTR(pad, Bool, false)
  709. .OP_END_FACTORY_REG(EncodeBase64)
  710. /**
  711. *@brief Decode web-safe base64-encoded strings . \n
  712. *@par Inputs:
  713. *Input may or may not have padding at the end. See EncodeBase64 for padding.
  714. *Web-safe means that input must use - and _ instead of + and /.
  715. *Inputs include:
  716. *x:Base64 strings to decode . \n
  717. *@par Outputs:
  718. *y:Decoded strings . \n
  719. *@see DecodeBase64()
  720. *@par Third-party framework compatibility
  721. *compatible with DecodeBase64 op of tensorflow
  722. *@par Restrictions:
  723. *Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
  724. */
  725. REG_OP(DecodeBase64)
  726. .INPUT(x, TensorType({DT_STRING}))
  727. .OUTPUT(y, TensorType({DT_STRING}))
  728. .OP_END_FACTORY_REG(DecodeBase64)
  729. } // namespace ge
  730. #endif // OPS_BUILT_IN_OP_PROTO_INC_STRING_OPS_H_

图引擎模块（GE）是MindSpore的一个子模块，其代码由C++实现，位于前端模块ME和底层硬件之间，起到承接作用。图引擎模块以ME下发的图作为输入，然后进行一系列的深度图优化操作，最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点，做了特定的优化工作，以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时，GE会被自动调用而用户并不感知。GE主要由GE API和GE Core两部分组成，详细的架构图如下所示。