You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

audio_ops.h 4.5 kB

5 years ago
(line-number gutter from the code viewer, lines 1-137)
  1. /**
  2. * Copyright 2019-2020 Huawei Technologies Co., Ltd
  3. *
  4. * Licensed under the Apache License, Version 2.0 (the "License");
  5. * you may not use this file except in compliance with the License.
  6. * You may obtain a copy of the License at
  7. *
  8. * http://www.apache.org/licenses/LICENSE-2.0
  9. *
  10. * Unless required by applicable law or agreed to in writing, software
  11. * distributed under the License is distributed on an "AS IS" BASIS,
  12. * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13. * See the License for the specific language governing permissions and
  14. * limitations under the License.
  15. */
  16. #ifndef GE_OP_AUDIO_OPS_H_
  17. #define GE_OP_AUDIO_OPS_H_
  18. #include "graph/operator_reg.h"
  19. namespace ge {
/**
*@brief Mel-Frequency Cepstral Coefficient (MFCC) calculation: takes the
*DCT-II of a log-magnitude mel-scale spectrogram.

*@par Inputs:
*The input spectrogram must be a three-dimensional tensor and sample_rate
*must be a scalar. Inputs include: \n
*@li spectrogram: 3D float tensor from which the mel-frequency cepstral
*coefficients are computed.
*@li sample_rate: Sample rate (scalar int32) used for the MFCC calculation.

*@par Attributes:
*@li upper_frequency_limit: Upper limit of the MFCC calculation frequency.
*@li lower_frequency_limit: Lower limit of the MFCC calculation frequency.
*@li filterbank_channel_count: Number of filterbank channels.
*@li dct_coefficient_count: Number of DCT coefficients to keep.

*@par Outputs:
*y: A float32 Tensor of the MFCCs of spectrogram.

*@attention Constraints: \n
*-The implementation for Mfcc on Ascend uses AI CPU, with poor performance. \n

*@par Quantization supported or not
*Not supported
*@par Quantized inference supported or not
*Supported
*@par L2 convergence supported or not
*(unspecified in the original documentation)
*@par Multiple batches supported or not
*(unspecified in the original documentation)
*/
REG_OP(Mfcc)
    .INPUT(spectrogram, TensorType({DT_FLOAT}))
    .INPUT(sample_rate, TensorType({DT_INT32}))
    .OUTPUT(y, TensorType({DT_FLOAT}))
    .ATTR(upper_frequency_limit, Float, 4000)
    .ATTR(lower_frequency_limit, Float, 20)
    .ATTR(filterbank_channel_count, Int, 40)
    .ATTR(dct_coefficient_count, Int, 13)
    .OP_END_FACTORY_REG(Mfcc)
/**
*@brief Produces a spectrogram from a float tensor of audio samples.

*@par Inputs:
*The input x must be a two-dimensional matrix. Inputs include: \n
*x: Float tensor of the WAV audio contents; dimensions are length and
*channel.

*@par Attributes:
*@li window_size: Size of the spectrogram window.
*@li stride: Stride of the spectrogram window.
*@li magnitude_squared: If true, output the squared magnitude.

*@par Outputs:
*spectrogram: 3-D float Tensor with the spectrogram contents.

*@attention Constraints: \n
*-The implementation for AudioSpectrogram on Ascend uses AI CPU, with poor
*performance. \n

*@par Quantization supported or not
*Not supported
*@par Quantized inference supported or not
*Supported
*@par L2 convergence supported or not
*(unspecified in the original documentation)
*@par Multiple batches supported or not
*(unspecified in the original documentation)
*/
REG_OP(AudioSpectrogram)
    .INPUT(x, TensorType({DT_FLOAT}))
    .OUTPUT(spectrogram, TensorType({DT_FLOAT}))
    .REQUIRED_ATTR(window_size, Int)
    .REQUIRED_ATTR(stride, Int)
    .ATTR(magnitude_squared, Bool, false)
    .OP_END_FACTORY_REG(AudioSpectrogram)
/**
*@brief Decodes a 16-bit WAV file into a float tensor.

*@par Inputs:
*The input contents must be a string tensor. Inputs include: \n
*@li contents: A Tensor of type string. The WAV-encoded audio, usually from
*a file.

*@par Attributes:
*@li desired_channels: An optional int. Defaults to -1. Number of sample
*channels wanted.
*@li desired_samples: An optional int. Defaults to -1. Length of audio
*requested.

*@par Outputs:
*@li audio: A Tensor of type float32 containing the decoded samples.
*@li sample_rate: A Tensor of type int32 containing the sample rate.

*@attention Constraints: \n
*-The implementation for DecodeWav on Ascend uses AI CPU, with poor
*performance. \n

*@par Quantization supported or not
*Not supported
*@par Quantized inference supported or not
*Supported
*@par L2 convergence supported or not
*(unspecified in the original documentation)
*@par Multiple batches supported or not
*(unspecified in the original documentation)
*/
REG_OP(DecodeWav)
    .INPUT(contents, TensorType({DT_STRING}))
    .OUTPUT(audio, TensorType({DT_FLOAT}))
    .OUTPUT(sample_rate, TensorType({DT_INT32}))
    .ATTR(desired_channels, Int, -1)
    .ATTR(desired_samples, Int, -1)
    .OP_END_FACTORY_REG(DecodeWav)
  105. REG_OP(EncodeWav)
  106. .INPUT(audio, TensorType({DT_FLOAT}))
  107. .INPUT(sample_rate, TensorType({DT_INT32}))
  108. .OUTPUT(contents, TensorType({DT_STRING}))
  109. .OP_END_FACTORY_REG(EncodeWav)
  110. } // namespace ge
  111. #endif // GE_OP_AUDIO_OPS_H_

图引擎模块(GE)是MindSpore的一个子模块,其代码由C++实现,位于前端模块ME和底层硬件之间,起到承接作用。图引擎模块以ME下发的图作为输入,然后进行一系列的深度图优化操作,最后输出一张可以在底层硬件上高效运行的图。GE针对昇腾AI处理器的硬件结构特点,做了特定的优化工作,以此来充分发挥出昇腾AI处理器的强大算力。在进行模型训练/推理时,GE会被自动调用而用户并不感知。GE主要由GE API和GE Core两部分组成(原文此处附有架构图,本文未包含)。