hummingbird
/
graphengine

/**
 * Copyright 2019-2020 Huawei Technologies Co., Ltd
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef GE_OP_AUDIO_OPS_H_
#define GE_OP_AUDIO_OPS_H_

#include "graph/operator_reg.h"

namespace ge {

/**
*@brief Mel-Frequency Cepstral Coefficient (MFCC) calculation consists of taking the DCT-II of a log-magnitude mel-scale spectrogram.

*@par Inputs:
*The input spectrogram must be three-dimensional tensor, sample_rate must be a scalar. Inputs include: \n
* @li spectrogram:3D float tensor of mel-frequency cepstral coefficient.
* @li sample_rate:Mel-Frequency Cepstral Coefficient (MFCC) calculation sample rate.

*@par Attributes:
*@li upper_frequency_limit:Upper limit of the mfcc calculation frequency.
*@li lower_frequency_limit:Lower limit of the mfcc calculation frequency.
*@li filterbank_channel_count:Count of the channel filterbank.
*@li dct_coefficient_count:Count of the dct coefficient.

*@par Outputs:
*y:A float32 Tensor of the MFCCs of spectrogram.

*@attention Constraints:\n
*-The implementation for Mfcc on Ascend uses AI CPU, with bad performance.\n

*@par Quantization supported or not
*Not supported
*@par Quantized inference supported or not
*Supported
*@par L2 convergence supported or not
*@par Multiple batches supported or not
*/

REG_OP(Mfcc)
    .INPUT(spectrogram, TensorType({DT_FLOAT}))
    .INPUT(sample_rate, TensorType({DT_INT32}))
    .OUTPUT(y, TensorType({DT_FLOAT}))
    .ATTR(upper_frequency_limit, Float, 4000)
    .ATTR(lower_frequency_limit, Float, 20)
    .ATTR(filterbank_channel_count, Int, 40)
    .ATTR(dct_coefficient_count, Int, 13)
    .OP_END_FACTORY_REG(Mfcc)

/**
*@brief Decode and generate spectrogram using wav float tensor.

*@par Inputs:
*The input x must be two-dimensional matrices. Inputs include: \n
* x:float tensor of the wav audio contents. contains length and channel

*@par Attributes:
*@li window_size:Size of the spectrogram window.
*@li stride:Size of the spectrogram stride.
*@li magnitude_squared:If true, using magnitude squared.

*@par Outputs:
*spectrogram:3-D float Tensor with the image contents.

*@attention Constraints:\n
*-The implementation for AudioSpectrogram on Ascend uses AI CPU, with bad performance.\n

*@par Quantization supported or not
*Not supported
*@par Quantized inference supported or not
*Supported
*@par L2 convergence supported or not
*@par Multiple batches supported or not
*/

REG_OP(AudioSpectrogram)
    .INPUT(x, TensorType({DT_FLOAT}))
    .OUTPUT(spectrogram, TensorType({DT_FLOAT}))
    .REQUIRED_ATTR(window_size, Int)
    .REQUIRED_ATTR(stride, Int)
    .ATTR(magnitude_squared, Bool, false)
    .OP_END_FACTORY_REG(AudioSpectrogram)

/**
*@brief Decode a 16-bit WAV file into a float tensor.

*@par Inputs:
*The input contents must be string tensor. Inputs include: \n
* @li contents:A Tensor of type string. The WAV-encoded audio, usually from a file.

*@par Attributes:
*@li desired_channels:An optional int. Defaults to -1. Number of sample channels wanted.
*@li desired_samples:An optional int. Defaults to -1. Length of audio requested.

*@par Outputs:
*@li *audio:A Tensor of type float32.
*@li *sample_rate:A Tensor of type int32.

*@attention Constraints: \n
*-The implementation for DecodeWav on Ascend uses AI CPU, with bad performance. \n

*@par Quantization supported or not
*Not supported
*@par Quantized inference supported or not
*Supported
*@par L2 convergence supported or not
*@par Multiple batches supported or not
*/

REG_OP(DecodeWav)
    .INPUT(contents, TensorType({DT_STRING}))
    .OUTPUT(audio, TensorType({DT_FLOAT}))
    .OUTPUT(sample_rate, TensorType({DT_INT32}))
    .ATTR(desired_channels, Int, -1)
    .ATTR(desired_samples, Int, -1)
    .OP_END_FACTORY_REG(DecodeWav)

REG_OP(EncodeWav)
    .INPUT(audio, TensorType({DT_FLOAT}))
    .INPUT(sample_rate, TensorType({DT_INT32}))
    .OUTPUT(contents, TensorType({DT_STRING}))
    .OP_END_FACTORY_REG(EncodeWav)
}   // namespace ge

#endif  // GE_OP_AUDIO_OPS_H_