ctc_ops.h
/**
 * Copyright 2019 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*!
 * \file ctc_ops.h
 * \brief Operator definitions for Connectionist Temporal Classification (CTC).
 */
#ifndef OPS_BUILT_IN_OP_PROTO_INC_CTC_OPS_H_
#define OPS_BUILT_IN_OP_PROTO_INC_CTC_OPS_H_

#include "graph/operator.h"
#include "graph/operator_reg.h"

namespace ge {
/**
*@brief Calculates the CTC Loss (log probability) for each batch entry.
Also calculates the gradient. \n
*@par Inputs:
*@li inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
*@li labels_indices: The indices of a `SparseTensor<int32, 2>`.
`labels_indices(i, :) == [b, t]` means `labels_values(i)` stores the id for
`(batch b, time t)`.
*@li labels_values: The values (labels) associated with the given batch and time.
*@li sequence_length: A vector containing sequence lengths (batch). \n
*@par Outputs:
*@li loss: A vector (batch) containing log-probabilities.
*@li gradient: The gradient of `loss`. 3-D, shape:
`(max_time x batch_size x num_classes)`. \n
*@par Attributes:
*@li preprocess_collapse_repeated: Scalar. If true, repeated labels are
collapsed prior to the CTC calculation. If not specified, defaults to false.
*@li ctc_merge_repeated: Scalar. If set to false, *during* CTC calculation
repeated non-blank labels will not be merged and are interpreted as
individual labels. This is a simplified version of CTC.
If not specified, defaults to true.
*@li ignore_longer_outputs_than_inputs: Scalar. If set to true, batch entries
whose output sequences are longer than their input sequences are skipped:
they contribute neither to the loss nor to the gradient.
If not specified, defaults to false. \n
*@par Third-party framework compatibility
* Compatible with the TensorFlow CTCLoss operator.
*/
REG_OP(CTCLoss)
    .INPUT(inputs, TensorType({DT_FLOAT, DT_DOUBLE}))
    .INPUT(labels_indices, TensorType({DT_INT64}))
    .INPUT(labels_values, TensorType({DT_INT32}))
    .INPUT(sequence_length, TensorType({DT_INT32}))
    .OUTPUT(loss, TensorType({DT_FLOAT, DT_DOUBLE}))
    .OUTPUT(gradient, TensorType({DT_FLOAT, DT_DOUBLE}))
    .ATTR(preprocess_collapse_repeated, Bool, false)
    .ATTR(ctc_merge_repeated, Bool, true)
    .ATTR(ignore_longer_outputs_than_inputs, Bool, false)
    .OP_END_FACTORY_REG(CTCLoss)
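// Illustrative only, not part of the original header: a sketch of building a
// CTCLoss node with the IR class this REG_OP generates. The set_input_*/
// set_attr_* setter names follow the usual GE code-generation pattern and are
// assumptions; check the generated op header if your version differs.
//
//   ge::op::CTCLoss ctc("ctc_loss");
//   ctc.set_input_inputs(logits)               // (max_time, batch_size, num_classes), float/double
//      .set_input_labels_indices(label_idx)    // int64 SparseTensor indices, rows are [batch, time]
//      .set_input_labels_values(label_vals)    // int32 label ids
//      .set_input_sequence_length(seq_len)     // int32, one length per batch entry
//      .set_attr_preprocess_collapse_repeated(false)
//      .set_attr_ctc_merge_repeated(true);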
/**
*@brief Performs greedy decoding on the logits given in inputs. \n
*@par Inputs:
*@li inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
*@li sequence_length: A vector containing sequence lengths, size `(batch_size)`. \n
*@par Attributes:
*@li merge_repeated: If true, merge repeated classes in the output.
If not specified, defaults to false. \n
*@par Outputs:
*@li decoded_indices: Indices matrix, size `(total_decoded_outputs x 2)`,
of a `SparseTensor<int64, 2>`. The rows store: [batch, time].
*@li decoded_values: Values vector, size `(total_decoded_outputs)`,
of a `SparseTensor<int64, 2>`. The vector stores the decoded classes.
*@li decoded_shape: Shape vector, size `(2)`, of the decoded SparseTensor.
Values are: `[batch_size, max_decoded_length]`.
*@li log_probability: Matrix, size `(batch_size x 1)`, containing sequence
log-probabilities. \n
*@par Third-party framework compatibility
* Compatible with the TensorFlow CTCGreedyDecoder operator.
*/
REG_OP(CTCGreedyDecoder)
    .INPUT(inputs, TensorType({DT_FLOAT, DT_DOUBLE}))
    .INPUT(sequence_length, TensorType({DT_INT32}))
    .ATTR(merge_repeated, Bool, false)
    .OUTPUT(decoded_indices, TensorType({DT_INT64}))
    .OUTPUT(decoded_values, TensorType({DT_INT64}))
    .OUTPUT(decoded_shape, TensorType({DT_INT64}))
    .OUTPUT(log_probability, TensorType({DT_FLOAT, DT_DOUBLE}))
    .OP_END_FACTORY_REG(CTCGreedyDecoder)
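// Illustrative only: a greedy-decoder node built the same way (setter names
// assumed from the REG_OP generation pattern).
//
//   ge::op::CTCGreedyDecoder decoder("ctc_greedy");
//   decoder.set_input_inputs(logits)            // (max_time, batch_size, num_classes)
//          .set_input_sequence_length(seq_len)  // int32, size (batch_size)
//          .set_attr_merge_repeated(true);      // collapse repeated classes in the output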
/**
*@brief Performs beam search decoding on the logits given in input. \n
*@par Inputs:
*@li inputs: 3-D, shape: `(max_time x batch_size x num_classes)`, the logits.
*@li sequence_length: A vector containing sequence lengths, size `(batch_size)`. \n
*@par Attributes:
*@li beam_width: A scalar >= 0, the beam search beam width (required).
*@li top_paths: A scalar >= 0 and <= beam_width, the number of paths to decode
(required); controls the output size.
*@li merge_repeated: If true, merge repeated classes in the output.
If not specified, defaults to true. \n
*@par Outputs:
*@li decoded_indices: A list (length: top_paths) of indices matrices. Matrix j,
size `(total_decoded_outputs[j] x 2)`, has the indices of a
`SparseTensor<int64, 2>`. The rows store: [batch, time].
*@li decoded_values: A list (length: top_paths) of values vectors. Vector j,
size `(total_decoded_outputs[j])`, has the values of a
`SparseTensor<int64, 2>`. The vector stores the decoded classes for beam j.
*@li decoded_shape: A list (length: top_paths) of shape vectors. Vector j,
size `(2)`, stores the shape of the decoded `SparseTensor[j]`.
Its values are: `[batch_size, max_decoded_length[j]]`.
*@li log_probability: A matrix, shaped: `(batch_size x top_paths)`. The
sequence log-probabilities. \n
*@par Third-party framework compatibility
* Compatible with the TensorFlow CTCBeamSearchDecoder operator.
*/
REG_OP(CTCBeamSearchDecoder)
    .INPUT(inputs, TensorType({DT_FLOAT, DT_DOUBLE}))
    .INPUT(sequence_length, TensorType({DT_INT32}))
    .REQUIRED_ATTR(beam_width, Int)
    .REQUIRED_ATTR(top_paths, Int)
    .ATTR(merge_repeated, Bool, true)
    .DYNAMIC_OUTPUT(decoded_indices, TensorType({DT_INT64}))
    .DYNAMIC_OUTPUT(decoded_values, TensorType({DT_INT64}))
    .DYNAMIC_OUTPUT(decoded_shape, TensorType({DT_INT64}))
    .OUTPUT(log_probability, TensorType({DT_FLOAT, DT_DOUBLE}))
    .OP_END_FACTORY_REG(CTCBeamSearchDecoder)
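// Illustrative only: because the decoded_* outputs are DYNAMIC_OUTPUTs, the
// caller sizes them (here to top_paths) before use. The create_dynamic_output_*
// helpers below follow the usual GE generation pattern and are assumptions.
//
//   ge::op::CTCBeamSearchDecoder bs("ctc_beam_search");
//   bs.set_input_inputs(logits)
//     .set_input_sequence_length(seq_len)
//     .set_attr_beam_width(100)                     // required attribute
//     .set_attr_top_paths(1);                       // required attribute
//   bs.create_dynamic_output_decoded_indices(1);    // one instance per decoded path
//   bs.create_dynamic_output_decoded_values(1);
//   bs.create_dynamic_output_decoded_shape(1);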
/**
*@brief The Connectionist Temporal Classification loss. \n
*@par Inputs:
*@li log_probs: Tensor of size (T, N, C), where T = input length, N = batch size,
and C = number of classes (including blank).
It represents the logarithmized probabilities of the outputs.
*@li targets: Tensor of size (N, S), where S = max target length.
It represents the target sequences.
*@li input_lengths: Tuple or tensor of size (N). It represents the lengths of the inputs.
*@li target_lengths: Tuple or tensor of size (N). It represents the lengths of the targets. \n
*@par Outputs:
*@li neg_log_likelihood: A loss value which is differentiable with respect to each input node.
*@li log_alpha: The probabilities of possible traces from input to target. \n
*@par Attributes:
*@li blank: Blank label. Defaults to 0.
*@li reduction: Specifies the reduction to apply to the output. Defaults to "mean".
*@li zero_infinity: Whether to zero infinite losses and the associated gradients.
Defaults to false. \n
*@par Third-party framework compatibility
* Compatible with the PyTorch CTCLoss operator. \n
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(CTCLossV2)
    .INPUT(log_probs, TensorType({DT_FLOAT, DT_DOUBLE}))
    .INPUT(targets, TensorType({DT_INT32, DT_INT64}))
    .INPUT(input_lengths, TensorType({DT_INT32, DT_INT64}))
    .INPUT(target_lengths, TensorType({DT_INT32, DT_INT64}))
    .OUTPUT(neg_log_likelihood, TensorType({DT_FLOAT, DT_DOUBLE}))
    .OUTPUT(log_alpha, TensorType({DT_FLOAT, DT_DOUBLE}))
    .ATTR(blank, Int, 0)
    .ATTR(reduction, String, "mean")
    .ATTR(zero_infinity, Bool, false)
    .OP_END_FACTORY_REG(CTCLossV2)
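// Illustrative only: a CTCLossV2 node mirroring the torch.nn.CTCLoss defaults
// (setter names assumed from the REG_OP generation pattern; the op itself is
// marked experimental above).
//
//   ge::op::CTCLossV2 ctc_v2("ctc_loss_v2");
//   ctc_v2.set_input_log_probs(log_probs)          // (T, N, C), logarithmized probabilities
//         .set_input_targets(targets)              // (N, S), int32/int64 labels
//         .set_input_input_lengths(input_lens)     // (N)
//         .set_input_target_lengths(target_lens)   // (N)
//         .set_attr_blank(0)
//         .set_attr_reduction("mean")
//         .set_attr_zero_infinity(false);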
/**
*@brief Computes the gradient of the Connectionist Temporal Classification loss. \n
*@par Inputs:
*@li grad_out: Gradient renewal coefficient. Tensor of size (N), where N = batch size.
*@li log_probs: Tensor of size (T, N, C), where T = input length, N = batch size,
and C = number of classes (including blank).
It represents the logarithmized probabilities of the outputs.
*@li targets: Tensor of size (N, S), where S = max target length.
It represents the target sequences.
*@li input_lengths: Tuple or tensor of size (N). It represents the lengths of the inputs.
*@li target_lengths: Tuple or tensor of size (N). It represents the lengths of the targets.
*@li neg_log_likelihood: A loss value which is differentiable with respect to each input node.
*@li log_alpha: The probabilities of possible traces from input to target. \n
*@par Outputs:
*@li grad: Tensor of size (T, N, C). The gradient of the Connectionist Temporal
Classification loss. \n
*@par Attributes:
*@li blank: Blank label. Defaults to 0.
*@li reduction: Specifies the reduction to apply to the output. Defaults to "mean".
*@li zero_infinity: Whether to zero infinite losses and the associated gradients.
Defaults to false. \n
*@par Third-party framework compatibility
* Compatible with the PyTorch CTCLoss operator. \n
*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/
REG_OP(CTCLossV2Grad)
    .INPUT(grad_out, TensorType({DT_FLOAT, DT_DOUBLE}))
    .INPUT(log_probs, TensorType({DT_FLOAT, DT_DOUBLE}))
    .INPUT(targets, TensorType({DT_INT32, DT_INT64}))
    .INPUT(input_lengths, TensorType({DT_INT32, DT_INT64}))
    .INPUT(target_lengths, TensorType({DT_INT32, DT_INT64}))
    .INPUT(neg_log_likelihood, TensorType({DT_FLOAT, DT_DOUBLE}))
    .INPUT(log_alpha, TensorType({DT_FLOAT, DT_DOUBLE}))
    .OUTPUT(grad, TensorType({DT_FLOAT, DT_DOUBLE}))
    .ATTR(blank, Int, 0)
    .ATTR(reduction, String, "mean")
    .ATTR(zero_infinity, Bool, false)
    .OP_END_FACTORY_REG(CTCLossV2Grad)
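// Illustrative only: the grad op consumes the forward op's outputs. Wiring an
// input to a named output of another node via set_input_*(op, "name") follows
// the usual GE pattern and is an assumption here.
//
//   ge::op::CTCLossV2Grad ctc_grad("ctc_loss_v2_grad");
//   ctc_grad.set_input_grad_out(grad_out)          // (N), upstream gradient
//           .set_input_log_probs(log_probs)
//           .set_input_targets(targets)
//           .set_input_input_lengths(input_lens)
//           .set_input_target_lengths(target_lens)
//           .set_input_neg_log_likelihood(ctc_v2, "neg_log_likelihood")
//           .set_input_log_alpha(ctc_v2, "log_alpha");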
}  // namespace ge

#endif  // OPS_BUILT_IN_OP_PROTO_INC_CTC_OPS_H_

The Graph Engine (GE) module is a submodule of MindSpore, implemented in C++. It sits between the frontend module (ME) and the underlying hardware and serves as the bridge between them. GE takes the graph handed down by ME as input, applies a series of deep graph-optimization passes, and finally produces a graph that runs efficiently on the underlying hardware. GE performs optimizations tailored to the hardware architecture of the Ascend AI processor in order to fully exploit its compute power. During model training and inference, GE is invoked automatically and is transparent to the user. GE consists mainly of two parts: the GE API and GE Core.
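As a rough sketch of how the GE API side is driven directly (a minimal sketch, assuming the classic ge/ge_api.h entry points GEInitialize/Session/RunGraph/GEFinalize; option keys and exact signatures vary across CANN/MindSpore versions, so treat the calls below as assumptions):

    #include <map>
    #include <string>
    #include <vector>
    #include "ge/ge_api.h"

    int main() {
      std::map<std::string, std::string> options;
      ge::GEInitialize(options);               // bring up GE Core
      ge::Session session(options);            // GE API entry point
      ge::Graph graph("demo");                 // the graph handed down by the frontend (ME)
      // ... populate graph, e.g. with ge::op::CTCLoss nodes as sketched above ...
      session.AddGraph(1, graph);              // register the graph with the session
      std::vector<ge::Tensor> inputs, outputs;
      session.RunGraph(1, inputs, outputs);    // GE optimizes, compiles, and executes the graph
      ge::GEFinalize();
      return 0;
    }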