Browse Source

!1939 code_sync_0705_inc

Merge pull request !1939 from mindspore_ding/code_sync_0705
tags/v1.3.0^0
i-robot Gitee 4 years ago
parent
commit
acc2472c41
13 changed files with 397 additions and 110 deletions
  1. +2
    -2
      inc/external/acl/acl.h
  2. +97
    -0
      inc/external/acl/ops/acl_dvpp.h
  3. +3
    -4
      third_party/fwkacllib/inc/ops/array_ops.h
  4. +31
    -31
      third_party/fwkacllib/inc/ops/elewise_calculation_ops.h
  5. +152
    -0
      third_party/fwkacllib/inc/ops/image_ops.h
  6. +12
    -10
      third_party/fwkacllib/inc/ops/matrix_calculation_ops.h
  7. +17
    -16
      third_party/fwkacllib/inc/ops/nn_detect_ops.h
  8. +14
    -16
      third_party/fwkacllib/inc/ops/nn_norm_ops.h
  9. +14
    -18
      third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h
  10. +3
    -3
      third_party/fwkacllib/inc/ops/pad_ops.h
  11. +24
    -0
      third_party/fwkacllib/inc/ops/random_ops.h
  12. +10
    -8
      third_party/fwkacllib/inc/ops/transformation_ops.h
  13. +18
    -2
      third_party/fwkacllib/inc/runtime/event.h

+ 2
- 2
inc/external/acl/acl.h View File

@@ -25,9 +25,9 @@
extern "C" { extern "C" {
#endif #endif


// Current version is 1.0.0
// Current version is 1.1.0
#define ACL_MAJOR_VERSION 1 #define ACL_MAJOR_VERSION 1
#define ACL_MINOR_VERSION 0
#define ACL_MINOR_VERSION 1
#define ACL_PATCH_VERSION 0 #define ACL_PATCH_VERSION 0


/** /**


+ 97
- 0
inc/external/acl/ops/acl_dvpp.h View File

@@ -158,6 +158,20 @@ enum acldvppJpegFormat {
ACL_JPEG_CSS_UNKNOWN = 1000 ACL_JPEG_CSS_UNKNOWN = 1000
}; };


enum acldvppChannelDescParamType { ACL_DVPP_CSC_MATRIX_UINT32 = 0 };

enum aclvdecChannelDescParamType { ACL_VDEC_CSC_MATRIX_UINT32 = 0 };

// Csc Matrix can be used both for acldvppChannelDescParamType and aclvdecChannelDescParamType
enum acldvppCscMatrix {
ACL_DVPP_CSC_MATRIX_BT601_WIDE = 0,
ACL_DVPP_CSC_MATRIX_BT601_NARROW,
ACL_DVPP_CSC_MATRIX_BT709_WIDE,
ACL_DVPP_CSC_MATRIX_BT709_NARROW,
ACL_DVPP_CSC_MATRIX_BT2020_WIDE,
ACL_DVPP_CSC_MATRIX_BT2020_NARROW
};

/** /**
* @ingroup AscendCL * @ingroup AscendCL
* @brief alloc device memory for dvpp. * @brief alloc device memory for dvpp.
@@ -2560,7 +2574,90 @@ ACL_FUNC_VISIBILITY aclError acldvppVpcBatchCropResizeMakeBorderAsync(
acldvppChannelDesc *channelDesc, acldvppBatchPicDesc *srcBatchPicDescs, uint32_t *roiNums, uint32_t size, acldvppChannelDesc *channelDesc, acldvppBatchPicDesc *srcBatchPicDescs, uint32_t *roiNums, uint32_t size,
acldvppBatchPicDesc *dstBatchPicDescs, acldvppRoiConfig *cropAreas[], acldvppBorderConfig *borderCfgs[], acldvppBatchPicDesc *dstBatchPicDescs, acldvppRoiConfig *cropAreas[], acldvppBorderConfig *borderCfgs[],
acldvppResizeConfig *resizeConfig, aclrtStream stream); acldvppResizeConfig *resizeConfig, aclrtStream stream);
/**
* @ingroup AscendCL
* @brief set param for dvpp channel desc
*
* @par Function
* set attribution in dvpp channelDesc for specified type
*
* @param channelDesc [OUT] the channel descriptor
* @param paramType [IN] specified param type
* @param length [IN] mem length of param
* @param param [IN] pointer to param
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see acldvppGetChannelDescParam | acldvppCreateChannelDesc | acldvppDestroyChannelDesc
*/
ACL_FUNC_VISIBILITY aclError acldvppSetChannelDescParam(acldvppChannelDesc *channelDesc,
acldvppChannelDescParamType paramType, size_t length,
const void *param);

/**
* @ingroup AscendCL
* @brief get param of dvpp channel desc
*
* @par Function
* get attribution value in dvpp channelDesc for specified type
*
* @param channelDesc [IN] the channel descriptor
* @param paramType [IN] specified param type
* @param length [IN] mem length allocated for output param
* @param paramRetSize [OUT] mem length of output param
* @param param [OUT] pointer to output param
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see acldvppSetChannelDescParam | acldvppCreateChannelDesc | acldvppDestroyChannelDesc
*/
ACL_FUNC_VISIBILITY aclError acldvppGetChannelDescParam(const acldvppChannelDesc *channelDesc,
acldvppChannelDescParamType paramType, size_t length,
size_t *paramRetSize, void *param);
/**
* @ingroup AscendCL
* @brief set param for vdec channel desc
*
* @par Function
* set attribution in channelDesc for specified type
*
* @param channelDesc [OUT] the vdec channel descriptor
* @param paramType [IN] specified param type
* @param length [IN] mem length of param
* @param param [IN] pointer to param
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclvdecGetChannelDescParam | aclvdecCreateChannelDesc | aclvdecDestroyChannelDesc
*/
ACL_FUNC_VISIBILITY aclError aclvdecSetChannelDescParam(aclvdecChannelDesc *channelDesc,
aclvdecChannelDescParamType paramType, size_t length,
const void *param);


/**
* @ingroup AscendCL
* @brief get param of vdec channel desc
*
* @par Function
* get attribution value in channelDesc for specified type
*
* @param channelDesc [IN] the vdec channel descriptor
* @param paramType [IN] specified param type
* @param length [IN] mem length allocated for output param
* @param paramRetSize [OUT] mem length of output param
* @param param [OUT] pointer to output param
*
* @retval ACL_SUCCESS The function is successfully executed.
* @retval OtherValues Failure
*
* @see aclvdecSetChannelDescParam | aclvdecCreateChannelDesc | aclvdecDestroyChannelDesc
*/
ACL_FUNC_VISIBILITY aclError aclvdecGetChannelDescParam(const aclvdecChannelDesc *channelDesc,
aclvdecChannelDescParamType paramType, size_t length,
size_t *paramRetSize, void *param);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif


+ 3
- 4
third_party/fwkacllib/inc/ops/array_ops.h View File

@@ -1154,18 +1154,17 @@ REG_OP(EditDistance)
.OP_END_FACTORY_REG(EditDistance) .OP_END_FACTORY_REG(EditDistance)


/** /**
* @brief sort_v2.
* @brief sort the input tensor without returning the value of index.


* @par Inputs: * @par Inputs:
* @li x: An ND tensor of type float16.
* x: An ND tensor of type float16.


* @par Attributes: * @par Attributes:

* @li axis: An optional int. The dimension to sort along. This value defaults to -1. * @li axis: An optional int. The dimension to sort along. This value defaults to -1.
* @li descending: An optional bool. Controls the sorting order (ascending or descending). This value defaults to False. * @li descending: An optional bool. Controls the sorting order (ascending or descending). This value defaults to False.


* @par Outputs: * @par Outputs:
* @li y: An ND tensor of type float16.
* y: An ND tensor of type float16.


* @attention Constraints: * @attention Constraints:
* @li Axis should select the last dim. * @li Axis should select the last dim.


+ 31
- 31
third_party/fwkacllib/inc/ops/elewise_calculation_ops.h View File

@@ -624,9 +624,9 @@ REG_OP(Log1p)


*@attention Constraints: *@attention Constraints:
*@li x2: The input data does not support 0 *@li x2: The input data does not support 0
*@li When NUM exceeds 2048 , the accuracy of operator cannot guarantee the
*@li When NUM exceeds 2048 , the accuracy of operator cannot guarantee the
*requirement of double thousandths in the mini form *requirement of double thousandths in the mini form
*@li Due to different architectures, the calculation results of this operator
*@li Due to different architectures, the calculation results of this operator
*on NPU and CPU may be inconsistent *on NPU and CPU may be inconsistent
*@li If shape is expressed as (D1,D2... ,Dn), then D1*D2... *DN<=1000000,n<=8 *@li If shape is expressed as (D1,D2... ,Dn), then D1*D2... *DN<=1000000,n<=8


@@ -2066,9 +2066,9 @@ REG_OP(FloorDiv)


*@attention Constraints: *@attention Constraints:
*@li x2: The input data does not support 0 *@li x2: The input data does not support 0
*@li When NUM exceeds 2048 , the accuracy of operator cannot guarantee the
*@li When NUM exceeds 2048 , the accuracy of operator cannot guarantee the
*requirement of double thousandths in the mini form *requirement of double thousandths in the mini form
*@li Due to different architectures, the calculation results of this operator
*@li Due to different architectures, the calculation results of this operator
*on NPU and CPU may be inconsistent *on NPU and CPU may be inconsistent
*@li If shape is expressed as (D1,D2... ,Dn), then D1*D2... *DN<=1000000,n<=8 *@li If shape is expressed as (D1,D2... ,Dn), then D1*D2... *DN<=1000000,n<=8


@@ -2200,9 +2200,9 @@ REG_OP(Tan)


*@attention Constraints: *@attention Constraints:
*@li x2: The input data does not support 0 *@li x2: The input data does not support 0
*@li When NUM exceeds 2048 , the accuracy of operator cannot guarantee the
*@li When NUM exceeds 2048 , the accuracy of operator cannot guarantee the
*requirement of double thousandths in the mini form *requirement of double thousandths in the mini form
*@li Due to different architectures, the calculation results of this operator
*@li Due to different architectures, the calculation results of this operator
*on NPU and CPU may be inconsistent *on NPU and CPU may be inconsistent
*@li If shape is expressed as (D1,D2... ,Dn), then D1*D2... *DN<=1000000,n<=8 *@li If shape is expressed as (D1,D2... ,Dn), then D1*D2... *DN<=1000000,n<=8


@@ -3395,7 +3395,7 @@ REG_OP(TensorRedirect)
* multiply the result by the scalar value and add it to tensor x1 * multiply the result by the scalar value and add it to tensor x1


* @par Inputs: * @par Inputs:
* Three inputs, including:
* Four inputs, including:
* @li input_data: A mutable input Tensor. Must be one of the following types: * @li input_data: A mutable input Tensor. Must be one of the following types:
* float16, float32. * float16, float32.
* @li x1: A mutable input Tensor of the same type as x1. * @li x1: A mutable input Tensor of the same type as x1.
@@ -3404,7 +3404,7 @@ REG_OP(TensorRedirect)
* float16, float32, int32. \n * float16, float32, int32. \n


* @par Outputs: * @par Outputs:
* @li y: A mutable Tensor. Has the same type as "x1". \n
* y: A mutable Tensor. Has the same type as "x1". \n


* @par Third-party framework compatibility * @par Third-party framework compatibility
* Compatible with the Pytorch operator Addcdiv. * Compatible with the Pytorch operator Addcdiv.
@@ -3418,12 +3418,12 @@ REG_OP(Addcdiv)
.OP_END_FACTORY_REG(Addcdiv) .OP_END_FACTORY_REG(Addcdiv)


/** /**
* @brief Performs the element-wise multiplication of tensor x2 by tensor x3,
* multiply the result by the scalar value and add it to tensor input_data
* @brief Performs the element-wise multiplication of tensor x2 by tensor x3,
* multiply the result by the scalar value and add it to tensor input_data




* @par Inputs: * @par Inputs:
* Three inputs, including:
* Four inputs, including:
* @li input_data: A mutable input Tensor. Must be one of the following types: * @li input_data: A mutable input Tensor. Must be one of the following types:
* float16, float32, int8, int32, uint8. * float16, float32, int8, int32, uint8.
* @li x1: A mutable input Tensor of the same type as x1. * @li x1: A mutable input Tensor of the same type as x1.
@@ -3431,7 +3431,7 @@ REG_OP(Addcdiv)
* @li value: A tensor which includes only one element of the same type as x1. \n * @li value: A tensor which includes only one element of the same type as x1. \n


* @par Outputs: * @par Outputs:
* @li y: A mutable output Tensor. Has the same type as "x1". \n
* y: A mutable output Tensor. Has the same type as "x1". \n


* @par Third-party framework compatibility * @par Third-party framework compatibility
* Compatible with the Pytorch operator Addcmul. * Compatible with the Pytorch operator Addcmul.
@@ -3453,7 +3453,7 @@ REG_OP(Addcmul)
* @li alpha: A scalar tensor of type float16, float32. \n * @li alpha: A scalar tensor of type float16, float32. \n


* @par Outputs: * @par Outputs:
* @li y: An ND tensor tensor with the same shape and type as "x1". \n
* y: An ND tensor tensor with the same shape and type as "x1". \n


* @par Third-party framework compatibility * @par Third-party framework compatibility
* Compatible with the Pytorch operator Axpy. * Compatible with the Pytorch operator Axpy.
@@ -3533,21 +3533,21 @@ REG_OP(TensorEqual)
.OP_END_FACTORY_REG(TensorEqual) .OP_END_FACTORY_REG(TensorEqual)


/** /**
* @brief Element-wise min of each of the input tensors (with Numpy-style broadcasting support).
* All inputs and outputs must have the same data type. This operator supports multidirectional
* @brief Element-wise min of each of the input tensors (with Numpy-style broadcasting support).
* All inputs and outputs must have the same data type. This operator supports multidirectional
* (i.e., Numpy-style) broadcasting * (i.e., Numpy-style) broadcasting
*
* @par inputs
*
* @par Inputs:
* one input including: * one input including:
* @li x: dynamic input A Tensor. Must be one of the following types: float32, float16, double, int32, int64
*
* @par output
* x: dynamic input A Tensor. Must be one of the following types: float32, float16, double, int32, int64
*
* @par Outputs:
* one output including: * one output including:
* @li y:A Tensor of the same type as x
*
* y:A Tensor of the same type as x
*
*/ */
REG_OP(MaxN) REG_OP(MaxN)
.DYNAMIC_INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_FLOAT64, DT_INT32, DT_INT64}))
.DYNAMIC_INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_FLOAT64, DT_INT32, DT_INT64}))
.OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_FLOAT64, DT_INT32, DT_INT64})) .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_FLOAT64, DT_INT32, DT_INT64}))
.OP_END_FACTORY_REG(MaxN) .OP_END_FACTORY_REG(MaxN)


@@ -3632,16 +3632,16 @@ REG_OP(DataCompare)
*which Hardmax will be performed.The output tensor has the same shape and contains the Hardmax values of the *which Hardmax will be performed.The output tensor has the same shape and contains the Hardmax values of the
*corresponding input. *corresponding input.
* *
*@par inputs
*@par Inputs:
*one input including: *one input including:
*@li x: input A Tensor.Must be one of the following types:float32,float16
*x: input A Tensor.Must be one of the following types:float32,float16
* *
*@par Attributes: *@par Attributes:
*@li axis:A required int attribute that decides which dimension will be used to cal the hard_max
*axis:A required int attribute that decides which dimension will be used to cal the hard_max
* *
*@par output:
*@par Outputs:
*one output including: *one output including:
*@li y:A Tensor of the same type as x
*y:A Tensor of the same type as x
* *
*/ */
REG_OP(HardMax) REG_OP(HardMax)
@@ -3669,7 +3669,7 @@ REG_OP(Dot)
.INPUT(input_y, TensorType({DT_FLOAT, DT_FLOAT16, DT_UINT8, DT_INT8, DT_INT32})) .INPUT(input_y, TensorType({DT_FLOAT, DT_FLOAT16, DT_UINT8, DT_INT8, DT_INT32}))
.OUTPUT(output, TensorType({DT_FLOAT, DT_FLOAT16, DT_UINT8, DT_INT8, DT_INT32})) .OUTPUT(output, TensorType({DT_FLOAT, DT_FLOAT16, DT_UINT8, DT_INT8, DT_INT32}))
.OP_END_FACTORY_REG(Dot) .OP_END_FACTORY_REG(Dot)
/** /**
*@brief Returns a new tensor with boolean elements representing \n *@brief Returns a new tensor with boolean elements representing \n
*if each element of input is “close” to the corresponding element of other \n *if each element of input is “close” to the corresponding element of other \n
@@ -3717,7 +3717,7 @@ REG_OP(IsClose)
* *
*@attention Constraints: *@attention Constraints:
*@li indices: only support int32,and shape same to "updates" *@li indices: only support int32,and shape same to "updates"
*@li The value range of "dimension" is [-dims, dims - 1]. "dims" is the dimension length of "x".
*@li The value range of "dimension" is [-dims, dims - 1]. "dims" is the dimension length of "x".
*@li y:A Tensor, the type and shape is same to "var" \n *@li y:A Tensor, the type and shape is same to "var" \n


*@par Third-party framework compatibility *@par Third-party framework compatibility
@@ -3752,7 +3752,7 @@ REG_OP(ArgMaxGrad)


*@attention Constraints: *@attention Constraints:
*@li indices: only support int32,and shape same to "updates" *@li indices: only support int32,and shape same to "updates"
*@li The value range of "dimension" is [-dims, dims - 1]. "dims" is the dimension length of "x".
*@li The value range of "dimension" is [-dims, dims - 1]. "dims" is the dimension length of "x".
*@li y:A Tensor, the type and shape is same to "var" \n *@li y:A Tensor, the type and shape is same to "var" \n


*@par Third-party framework compatibility *@par Third-party framework compatibility


+ 152
- 0
third_party/fwkacllib/inc/ops/image_ops.h View File

@@ -1512,6 +1512,9 @@ REG_OP(IMGWarp)


*@par Outputs: *@par Outputs:
*map_img: A Tensor after resize. \n *map_img: A Tensor after resize. \n

*@par Restrictions:
*Warning:THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/ */
REG_OP(Remap) REG_OP(Remap)
.INPUT(img, TensorType({DT_UINT8, DT_FLOAT16, DT_FLOAT32})) .INPUT(img, TensorType({DT_UINT8, DT_FLOAT16, DT_FLOAT32}))
@@ -1848,6 +1851,9 @@ REG_OP(GridUnnormal)


*@par Outputs: *@par Outputs:
*y: Returns 4-D Tensor with the same dtype as `x`. *y: Returns 4-D Tensor with the same dtype as `x`.

*@par Restrictions:
*Warning:THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/ */
REG_OP(ImageUnfold) REG_OP(ImageUnfold)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
@@ -1940,5 +1946,151 @@ REG_OP(GridSampler3DGrad)
.ATTR(align_corners, Bool, false) .ATTR(align_corners, Bool, false)
.OP_END_FACTORY_REG(GridSampler3DGrad) .OP_END_FACTORY_REG(GridSampler3DGrad)


/**
*@brief Upsample the 3-D data with the nearest neighbor interpolation algorithm. \n

*@par Inputs:
*One inputs, including:
* @li x: A 5-D input tensor [N, C, D, H, W]. Must be one of the following types:
* float32, float64. \n

*@par Attributes:
*@li output_size: An optional listInt. Defaults to none.
contain 3 elements: output_depth, output_height, output_width. The number of elements of 'output_size'
should be the same as the rank of input 'x'. Only one of 'scales' and 'output_size' can be specified. \n
*@li scales: An optional listFloat. Defaults to none.
The scale array along each dimension, contain 3 elements: scale_depth, scale_height, scale_width.
The number of elements of 'scales' should be the same as the rank of input 'x'. One of 'scales' and
'output_size' MUST be specified and it is an error if both are specified. \n

*@par Outputs:
*y: A 5-D tensor. Has the same type as input x, shape depends on x and output_size/scales. \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. \n
*/

REG_OP(UpsampleNearest3d)
.INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE}))
.ATTR(output_size, ListInt, {})
.ATTR(scales, ListFloat, {})
.OP_END_FACTORY_REG(UpsampleNearest3d)

/**
*@brief Upsample the 3-D data with the trilinear interpolation algorithm. \n

*@par Inputs:
*One inputs, including:
* @li x: A 5-D input tensor [N, C, D, H, W]. Must be one of the following types:
* float32, float64. \n

*@par Attributes:
*@li output_size: An optional listInt. Defaults to none.
contain 3 elements: output_depth, output_height, output_width. The number of elements of 'output_size' should
be the same as the rank of input 'x'. Only one of 'scales' and 'output_size' can be specified. \n
*@li scales: An optional listFloat. Defaults to none.
The scale array along each dimension, contain 3 elements: scale_depth, scale_height, scale_width.
The number of elements of 'scales' should be the same as the rank of input 'x'.
One of 'scales' and 'output_size' MUST be specified and it is an error if both are specified. \n
*@li align_corners: An optional bool. Defaults to false.
If true, the input and output tensors are aligned by the center points of their corner pixels, preserving the
values at the corner pixels. If false, the input and output tensors are aligned by the corner points of their
corner pixels, and the interpolation use edge value padding for out of boundary values. \n

*@par Outputs:
*y: A 5-D tensor. Has the same type as input x, shape depends on x and output_size/scales. \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. \n
*/

REG_OP(UpsampleTrilinear3d)
.INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE}))
.ATTR(output_size, ListInt, {})
.ATTR(scales, ListFloat, {})
.ATTR(align_corners, Bool, false)
.OP_END_FACTORY_REG(UpsampleTrilinear3d)

/**
*@brief Upsample the 3-D gradient data with the nearest neighbor interpolation algorithm. \n

*@par Inputs:
*One inputs, including:
* @li grad_output: A 5-D input tensor [N, C, D, H, W]. Must be one of the following types:
* float32, float64. \n

*@par Attributes:
*@li input_size: A required listInt.
contain 5 elements: [min_batch, channels, depth, height, width]. Must:
input_size[0] == grad_output_tensor_size[0]
input_size[1] == grad_output_tensor_size[1]. \n
*@li output_size: An optional listInt. Defaults to none.
contain 3 elements: depth, height, width. The number of elements of 'output_size' should
be the same as the rank of input 'grad_output'. Only one of 'scales' and 'output_size' can be specified. Must:
grad_output_tensor_size[2] == floor(input_size[2] * scales[0]) == output_size[0]
grad_output_tensor_size[3] == floor(input_size[3] * scales[1]) == output_size[1]
grad_output_tensor_size[4] == floor(input_size[4] * scales[2]) == output_size[2]. \n
*@li scales: An optional listFloat. Defaults to none.
The scale array along each dimension, contain 3 elements: scale_depth, scale_height, scale_width.
The number of elements of 'scales' should be the same as the rank of input 'grad_output'.
One of 'scales' and 'output_size' MUST be specified and it is an error if both are specified. \n

*@par Outputs:
*y: A 5-D tensor. Has the same type as input grad_output, shape depends on Attributes:input_size. \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/

REG_OP(UpsampleNearest3dGrad)
.INPUT(grad_output, TensorType({DT_FLOAT, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE}))
.REQUIRED_ATTR(input_size, ListInt)
.ATTR(output_size, ListInt, {})
.ATTR(scales, ListFloat, {})
.OP_END_FACTORY_REG(UpsampleNearest3dGrad)

/**
*@brief Upsample the 3-D gradient data with the trilinear interpolation algorithm. \n

*@par Inputs:
*One inputs, including:
* @li grad_output: A 5-D input tensor [N, C, D, H, W]. Must be one of the following types:
* float32, float64. \n

*@par Attributes:
*@li input_size: A required listInt.
contain 5 elements: [min_batch, channels, depth, height, width]. Must:
input_size[0] == grad_output_tensor_size[0]
input_size[1] == grad_output_tensor_size[1]. \n
*@li output_size: An optional listInt. Defaults to none.
contain 3 elements: depth, height, width. The number of elements of 'output_size' should
be the same as the rank of input 'grad_output'. Only one of 'scales' and 'output_size' can be specified. Must:
grad_output_tensor_size[2] == floor(input_size[2] * scales[0]) == output_size[0]
grad_output_tensor_size[3] == floor(input_size[3] * scales[1]) == output_size[1]
grad_output_tensor_size[4] == floor(input_size[4] * scales[2]) == output_size[2]. \n
*@li scales: An optional listFloat. Defaults to none.
The scale array along each dimension, contain 3 elements: scale_depth, scale_height, scale_width.
The number of elements of 'scales' should be the same as the rank of input 'grad_output'.
One of 'scales' and 'output_size' MUST be specified and it is an error if both are specified. \n

*@par Outputs:
*y: A Tensor whose shape depends on input_size and output_size/scales. Must be one of the following
types: float32, float64. \n

*@par Restrictions:
*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
*/

REG_OP(UpsampleTrilinear3dGrad)
.INPUT(grad_output, TensorType({DT_FLOAT, DT_DOUBLE}))
.OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE}))
.REQUIRED_ATTR(input_size, ListInt)
.ATTR(output_size, ListInt, {})
.ATTR(scales, ListFloat, {})
.ATTR(align_corners, Bool, false)
.OP_END_FACTORY_REG(UpsampleTrilinear3dGrad)
} // namespace ge } // namespace ge
#endif // OPS_BUILT_IN_OP_PROTO_INC_IMAGE_OPS_H_ #endif // OPS_BUILT_IN_OP_PROTO_INC_IMAGE_OPS_H_

+ 12
- 10
third_party/fwkacllib/inc/ops/matrix_calculation_ops.h View File

@@ -1120,11 +1120,12 @@ REG_OP(IndexAdd)
*@brief: Returns the upper triangular part of a matrix (2-D tensor) or batch of matrices input \n *@brief: Returns the upper triangular part of a matrix (2-D tensor) or batch of matrices input \n


*@par Inputs: *@par Inputs:
* Two inputs, including:
*@li x: A Tensor. Must be one of the following types:
* float16, float32, double, int32, uint8, int16, int8, complex64, int64,
* qint8, quint8, qint32, uint16, complex128, uint32, uint64.
*@li diagonal:(int, optional) – the diagonal to consider。\n
*x: A Tensor. Must be one of the following types:
*float16, float32, double, int32, uint8, int16, int8, complex64, int64,
*qint8, quint8, qint32, uint16, complex128, uint32, uint64. \n

*@par Attributes:
*diagonal: An optional attribute indicates the diagonal to consider. \n


*@par Outputs: *@par Outputs:
*y: A Tensor. Has the same type as "x" . \n *y: A Tensor. Has the same type as "x" . \n
@@ -1142,11 +1143,12 @@ REG_OP(Triu)
*@brief: Returns the upper triangular part of a matrix (2-D tensor) or batch of matrices input \n *@brief: Returns the upper triangular part of a matrix (2-D tensor) or batch of matrices input \n


*@par Inputs: *@par Inputs:
* Two inputs, including:
*@li x: A Tensor. Must be one of the following types:
* float16, float32, double, int32, uint8, int16, int8, complex64, int64,
* qint8, quint8, qint32, uint16, complex128, uint32, uint64.
*@li diagonal:(int, optional) – the diagonal to consider。\n
*x: A Tensor. Must be one of the following types:
*float16, float32, double, int32, uint8, int16, int8, complex64, int64,
*qint8, quint8, qint32, uint16, complex128, uint32, uint64. \n

*@par Attributes:
*diagonal: An optional attribute indicates the diagonal to consider. \n


*@par Outputs: *@par Outputs:
*y: A Tensor. Has the same type as "x" . \n *y: A Tensor. Has the same type as "x" . \n


+ 17
- 16
third_party/fwkacllib/inc/ops/nn_detect_ops.h View File

@@ -1445,16 +1445,16 @@ REG_OP(DecodeBboxV2)
.OP_END_FACTORY_REG(DecodeBboxV2) .OP_END_FACTORY_REG(DecodeBboxV2)


/** /**
*@brief Computes sort function.
*@brief sort the input tensor and return the value of index.
* *
*@par Inputs: *@par Inputs:
*Inputs include: *Inputs include:
* x: A Tensor. Dtype support: flaot16, flaot, int16, int8,
* x: A Tensor. Dtype support: float16, float, int16, int8,
uint8, int32, int64. uint8, int32, int64.
* *
*@par Attributes: *@par Attributes:
* @li axis: optional, int.
* @li descending: optional,bool.
* @li axis: An optional attribute indicates the sorting axis.
* @li descending: An optional attribute indicates descending sort or not.
* *
*@par Outputs: *@par Outputs:
* @li y1: A Tensor. Must have the same type as x. * @li y1: A Tensor. Must have the same type as x.
@@ -1515,10 +1515,10 @@ whether boxes overlap too much with respect to IOU.
deciding when to remove boxes based on score . \n deciding when to remove boxes based on score . \n


*@par Attributes: *@par Attributes:
*center_point_box:Integer indicate the format of the box data.
The default is 0. 0 - the box data is supplied as [y1, x1, y2, x2]
where (y1, x1) and (y2, x2) are the coordinates of any diagonal pair
of box corners and the coordinates can be provided as normalized
*center_point_box:Integer indicate the format of the box data.
The default is 0. 0 - the box data is supplied as [y1, x1, y2, x2]
where (y1, x1) and (y2, x2) are the coordinates of any diagonal pair
of box corners and the coordinates can be provided as normalized
(i.e., lying in the interval [0, 1]) or absolute.Mostly used for TF models. (i.e., lying in the interval [0, 1]) or absolute.Mostly used for TF models.
1 - the box data is supplied as [x_center, y_center, width, height]. 1 - the box data is supplied as [x_center, y_center, width, height].
Mostly used for Pytorch models. \n Mostly used for Pytorch models. \n
@@ -1567,16 +1567,18 @@ deciding when to remove boxes based on score . \n
the last dim representing (batch_id,class_id,index_id) . \n the last dim representing (batch_id,class_id,index_id) . \n


*@par Attributes: *@par Attributes:
*center_point_box:Integer indicate the format of the box data.
The default is 0. 0 - the box data is supplied as [y1, x1, y2, x2]
where (y1, x1) and (y2, x2) are the coordinates of any diagonal pair
of box corners and the coordinates can be provided as normalized
*@li center_point_box:Integer indicate the format of the box data.
The default is 0. 0 - the box data is supplied as [y1, x1, y2, x2]
where (y1, x1) and (y2, x2) are the coordinates of any diagonal pair
of box corners and the coordinates can be provided as normalized
(i.e., lying in the interval [0, 1]) or absolute.Mostly used for TF models. (i.e., lying in the interval [0, 1]) or absolute.Mostly used for TF models.
1 - the box data is supplied as [x_center, y_center, width, height]. 1 - the box data is supplied as [x_center, y_center, width, height].
Mostly used for Pytorch models. \n
Mostly used for Pytorch models.
*@li max_boxes_size: An optional attribute integer representing the real maximum
*number of boxes to be selected by non max suppression . \n


*@par Outputs: *@par Outputs:
*@li selected_indices: A 2-D integer tensor of shape [M] representing the
*selected_indices: A 2-D integer tensor of shape [M] representing the
selected indices from the boxes tensor, where M <= max_output_size. \n selected indices from the boxes tensor, where M <= max_output_size. \n


*@attention Constraints: *@attention Constraints:
@@ -1602,7 +1604,7 @@ REG_OP(NonMaxSuppressionV7)
*@brief Obtains the ROI feature matrix from the feature map list. It is a customized fused operator for mmdetection. \n *@brief Obtains the ROI feature matrix from the feature map list. It is a customized fused operator for mmdetection. \n


*@par Inputs: *@par Inputs:
* Three inputs, including:
* Two inputs, including:
*@li features: A 5HD Tensor list of type float32 or float16. *@li features: A 5HD Tensor list of type float32 or float16.
*@li rois: ROI position. A 2D Tensor of float32 or float16 with shape (N, 5). "N" indicates the number of ROIs, *@li rois: ROI position. A 2D Tensor of float32 or float16 with shape (N, 5). "N" indicates the number of ROIs,
* the value "5" indicates the indexes of images where the ROIs are located, "x0", "y0", "x1", and "y1". * the value "5" indicates the indexes of images where the ROIs are located, "x0", "y0", "x1", and "y1".
@@ -1818,4 +1820,3 @@ REG_OP(GridAssignPositive)
} // namespace ge } // namespace ge


#endif // OPS_BUILT_IN_OP_PROTO_INC_NN_DETECT_OPS_H_ #endif // OPS_BUILT_IN_OP_PROTO_INC_NN_DETECT_OPS_H_


+ 14
- 16
third_party/fwkacllib/inc/ops/nn_norm_ops.h View File

@@ -568,7 +568,7 @@ REG_OP(LayerNorm)
.OP_END_FACTORY_REG(LayerNorm) .OP_END_FACTORY_REG(LayerNorm)


/** /**
*@brief Returns a tensor where each sub-tensor of input along dimension
*@brief Returns a tensor where each sub-tensor of input along dimension
* dim is normalized such that the p-norm of the sub-tensor is lower than the value maxnorm. \n * dim is normalized such that the p-norm of the sub-tensor is lower than the value maxnorm. \n


*@par Inputs: *@par Inputs:
@@ -576,7 +576,7 @@ REG_OP(LayerNorm)
* @li x: A Tensor. Must be one of the following types: float16, float32 . \n * @li x: A Tensor. Must be one of the following types: float16, float32 . \n


*@par Attributes: *@par Attributes:
* @li p: Specify L_p norm, the type is float.
* @li p: Specify L_p norm, the type is float.
* @li dim: The processed dim, the type is int. * @li dim: The processed dim, the type is int.
* @li maxnorm: Threshold for comparison, the type is float. \n * @li maxnorm: Threshold for comparison, the type is float. \n


@@ -1543,14 +1543,14 @@ REG_OP(SigmoidCrossEntropyWithLogitsGradV2)
.ATTR(reduction, String, "mean") .ATTR(reduction, String, "mean")
.OP_END_FACTORY_REG(SigmoidCrossEntropyWithLogitsGradV2) .OP_END_FACTORY_REG(SigmoidCrossEntropyWithLogitsGradV2)
/** /**
* @brief Calculate the PoissonNllLoss function.
* @brief Calculate the PoissonNllLoss function.
* target∼Poisson(input)loss(input,target)=input−target∗log(input)+log(target!) \n * target∼Poisson(input)loss(input,target)=input−target∗log(input)+log(target!) \n


* @par Inputs: * @par Inputs:
* Two inputs, including: * Two inputs, including:
* @li input_x: A tensor. Must be one of the following types: * @li input_x: A tensor. Must be one of the following types:
* float16, float32. \n * float16, float32. \n
*
*
* @par Inputs: * @par Inputs:
* @li target: A tensor. Must be one of the following types: * @li target: A tensor. Must be one of the following types:
* float16, float32. \n * float16, float32. \n
@@ -1558,13 +1558,13 @@ REG_OP(SigmoidCrossEntropyWithLogitsGradV2)
* @par Attributes: * @par Attributes:
* four Attributes, including: * four Attributes, including:
* @li log_input: An optional bool. Defaults to "True" \n * @li log_input: An optional bool. Defaults to "True" \n
*
*
* @par Attributes: * @par Attributes:
* @li full: An optional bool. Defaults to "False" \n * @li full: An optional bool. Defaults to "False" \n
*
*
* @par Attributes: * @par Attributes:
* @li eps: An optional float. Defaults to "1e-8" \n * @li eps: An optional float. Defaults to "1e-8" \n
*
*
* @par Attributes: * @par Attributes:
* @li reduction: An optional string. Defaults to "mean" \n * @li reduction: An optional string. Defaults to "mean" \n


@@ -1592,7 +1592,7 @@ REG_OP(PoissonNllLoss)
* @li num_step: A required int.\n * @li num_step: A required int.\n
* @li hidden_size: A required int. \n * @li hidden_size: A required int. \n
* *
*
*
* @par Output: * @par Output:
* y: A mutable Tensor of type float16, with the shape of [num_step, batch_size, hidden_size]. \n * y: A mutable Tensor of type float16, with the shape of [num_step, batch_size, hidden_size]. \n
* *
@@ -1605,24 +1605,22 @@ REG_OP(RnnGenMask)
.OP_END_FACTORY_REG(RnnGenMask) .OP_END_FACTORY_REG(RnnGenMask)


/** /**
* @brief Creates a criterion that optimizes a multi-class multi-classification hinge loss (margin-based loss)
* @brief Creates a criterion that optimizes a multi-class multi-classification hinge loss (margin-based loss)
* between input x (a 2D mini-batch Tensor) and output y (which is a 2D Tensor of target class indices) \n * between input x (a 2D mini-batch Tensor) and output y (which is a 2D Tensor of target class indices) \n
* @par Inputs: * @par Inputs:
* Two inputs, including: * Two inputs, including:
* @li x: A tensor. Must be one of the following types: * @li x: A tensor. Must be one of the following types:
* float16, float32. \n
*
* @par Inputs:
* float16, float32.
* @li target: A tensor. Must be the following types: * @li target: A tensor. Must be the following types:
* int32. \n * int32. \n


* @par Attributes: * @par Attributes:
* @li reduction: An optional string. Defaults to "mean" \n
* reduction: An optional string. Defaults to "mean" \n


* @par Outputs: * @par Outputs:
* y: A Tensor has same element type as input x. \n
* is_target: A Tensor has same element type as input target. \n
* @li y: A Tensor has same element type as input x. \n
* @li is_target: A Tensor has same element type as input target. \n


* @par Third-party framework compatibility * @par Third-party framework compatibility
* Compatible with the Pytorch operator MultiLabelMarginLoss. \n * Compatible with the Pytorch operator MultiLabelMarginLoss. \n


+ 14
- 18
third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h View File

@@ -224,22 +224,22 @@ REG_OP(Relu6Grad)
.OUTPUT(backprops, TensorType::RealNumberType()) .OUTPUT(backprops, TensorType::RealNumberType())
.OP_END_FACTORY_REG(Relu6Grad) .OP_END_FACTORY_REG(Relu6Grad)
/** /**
*@brief Calculate the elu_grad_v2 function.
*@brief Calculate the elu_grad_v2 function.
*Applies the element-wise function: *Applies the element-wise function:
* Computes the backward for the elu: if x>0, 1; otherwise elu() + alpha . * Computes the backward for the elu: if x>0, 1; otherwise elu() + alpha .
*@par Inputs: *@par Inputs:
*One inputs, including: *One inputs, including:
* @li grads: A tensor. Must be one of the following types: * @li grads: A tensor. Must be one of the following types:
* float16, float32.
* float16, float32.
* @li activations: A tensor. Must be one of the following types: * @li activations: A tensor. Must be one of the following types:
* float16, float32.
* float16, float32.
* *
*@par Outputs: *@par Outputs:
*y: A Tensor with the same type and shape of grads's. *y: A Tensor with the same type and shape of grads's.
*
*
*@par Attributes: *@par Attributes:
*@li alpha: scalar parameter, default value = 1.0 *@li alpha: scalar parameter, default value = 1.0
*/
*/
REG_OP(EluGradV2) REG_OP(EluGradV2)
.INPUT(grads, TensorType({DT_FLOAT, DT_FLOAT16})) .INPUT(grads, TensorType({DT_FLOAT, DT_FLOAT16}))
.INPUT(activations, TensorType({DT_FLOAT, DT_FLOAT16})) .INPUT(activations, TensorType({DT_FLOAT, DT_FLOAT16}))
@@ -539,24 +539,20 @@ REG_OP(Elu)
*x: A float16, float32, for the input data type . \n *x: A float16, float32, for the input data type . \n


*@par Attributes: *@par Attributes:
*alpha1: A float32. Defines at which negative value the ELU saturates. Defaults to "1.0" . \n

*@par Attributes:
*alpha2: A float32. Defines at which negative value the ELU saturates. Defaults to "1.0" . \n

*@par Attributes:
*alpha3: A float32. Defines at which positive value the ELU saturates. Defaults to "1.0" . \n
*@li alpha1: A float32. Defines at which negative value the ELU saturates. Defaults to "1.0" .
*@li alpha2: A float32. Defines at which negative value the ELU saturates. Defaults to "1.0" .
*@li alpha3: A float32. Defines at which positive value the ELU saturates. Defaults to "1.0" . \n


*@par Outputs: *@par Outputs:
*y: A float16, float32, for the normalized result . \n *y: A float16, float32, for the normalized result . \n


*@attention Constraints: *@attention Constraints:
*@li The input is of type float16 or float32 . \n
*The input is of type float16 or float32 . \n


*@par Multiple batches supported or not *@par Multiple batches supported or not
*Supported *Supported
*@par Third-party framework compatibility *@par Third-party framework compatibility
*@li Compatible with ONNX's Celu operator
*Compatible with ONNX's Celu operator
*/ */
REG_OP(Celu) REG_OP(Celu)
.INPUT(x, TensorType({DT_FLOAT,DT_FLOAT16})) .INPUT(x, TensorType({DT_FLOAT,DT_FLOAT16}))
@@ -808,15 +804,15 @@ REG_OP(SoftplusV2Grad)
/** /**
* @brief ThresholdedRelu takes one input data (Tensor) and produces one output data (Tensor) * @brief ThresholdedRelu takes one input data (Tensor) and produces one output data (Tensor)
* where the rectified linear function, y = x for x > alpha, y = 0 otherwise, is applied to the tensor elementwise. * where the rectified linear function, y = x for x > alpha, y = 0 otherwise, is applied to the tensor elementwise.
*
*
* @par inputs * @par inputs
* one input including: * one input including:
* @li x: input A Tensor. Must be one of the following types: float32, float16 * @li x: input A Tensor. Must be one of the following types: float32, float16
*
*
* @par output * @par output
* one output including: * one output including:
* @li y:A Tensor of the same type as x * @li y:A Tensor of the same type as x
*
*
*/ */
REG_OP(ThresholdedRelu) REG_OP(ThresholdedRelu)
.INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT}))
@@ -892,7 +888,7 @@ REG_OP(HardShrink)


* @par Third-party framework compatibility * @par Third-party framework compatibility
* Compatible with the Pytorch operator Hardsigmoid. \n * Compatible with the Pytorch operator Hardsigmoid. \n
*/
*/
REG_OP(HardSigmoid) REG_OP(HardSigmoid)
.INPUT(input_x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32})) .INPUT(input_x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32}))
.OUTPUT(output_y, TensorType({DT_FLOAT, DT_FLOAT16})) .OUTPUT(output_y, TensorType({DT_FLOAT, DT_FLOAT16}))


+ 3
- 3
third_party/fwkacllib/inc/ops/pad_ops.h View File

@@ -213,11 +213,11 @@ REG_OP(PadV2)
*@brief Pads a tensor . \n *@brief Pads a tensor . \n


*@par Inputs: *@par Inputs:
*x: A Tensor. Must be one of the following types: float16, float32, int32 . \n
*constant_values: A Tensor. Must have the same type as input.
*@li x: A Tensor. Must be one of the following types: float16, float32, int32 . \n
*@li constant_values: A Tensor. Must have the same type as input.


*@par Attributes: *@par Attributes:
*paddings: An optional "vector<vector<int>>". Defaults to "{}".
*paddings: A required Attribute.
* For each dimension D of input, paddings[D, 0] indicates how many * For each dimension D of input, paddings[D, 0] indicates how many
* values to add before the contents of tensor in that dimension, * values to add before the contents of tensor in that dimension,
* and paddings[D, 1] indicates how many values to add after the * and paddings[D, 1] indicates how many values to add after the


+ 24
- 0
third_party/fwkacllib/inc/ops/random_ops.h View File

@@ -584,6 +584,30 @@ REG_OP(DropoutV2)
.OUTPUT(seed, TensorType({ DT_FLOAT })) .OUTPUT(seed, TensorType({ DT_FLOAT }))
.REQUIRED_ATTR(p, Float) .REQUIRED_ATTR(p, Float)
.OP_END_FACTORY_REG(DropoutV2) .OP_END_FACTORY_REG(DropoutV2)

/**
* @brief The Bernoulli distribution with probability . \n

* @par Inputs:
* @li x: A ND Tensor. Must be one of the following data types:
int8, uint8, int16, int32, int64, bool, float32, float64 .
* @li p: A ND Tensor. The probability of an element to be zeroed.
Must be one of the following data types: float32, float64. \n

* @par Attributes:
* seed: An Integer, the seed of the random generator. Default value -1
to use current timestamp, otherwise it should be a positive integer.

* @par Outputs:
* y: A tensor with the same shape and type as "x".
*/

REG_OP(Bernoulli)
.INPUT(x, TensorType({ DT_INT8, DT_UINT8, DT_INT16, DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT, DT_DOUBLE}))
.INPUT(p, TensorType({ DT_FLOAT, DT_DOUBLE }))
.OUTPUT(y, TensorType({ DT_INT8, DT_UINT8, DT_INT16, DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT, DT_DOUBLE}))
.ATTR(seed, Int, -1)
.OP_END_FACTORY_REG(Bernoulli)
} // namespace ge } // namespace ge


#endif // OPS_BUILT_IN_OP_PROTO_INC_RANDOM_OPS_H_ #endif // OPS_BUILT_IN_OP_PROTO_INC_RANDOM_OPS_H_

+ 10
- 8
third_party/fwkacllib/inc/ops/transformation_ops.h View File

@@ -182,14 +182,14 @@ REG_OP(Permute)
* int8, uint8, int16, uint16, int32, uint32, int64,uint64, float16, float32. * int8, uint8, int16, uint16, int32, uint32, int64,uint64, float16, float32.


*@par Outputs: *@par Outputs:
* y: A 2D flattened Tensor with the contents of the input tensor, with input dimensions up to axis flattened
* y: A 2D flattened Tensor with the contents of the input tensor, with input dimensions up to axis flattened
* to the outer dimension of the output and remaining input dimensions flattened into the inner dimension of the output. * to the outer dimension of the output and remaining input dimensions flattened into the inner dimension of the output.
* Must be one of the following data types: int8, uint8, int16, uint16, int32, uint32, int64,uint64, float16, float32 . * Must be one of the following data types: int8, uint8, int16, uint16, int32, uint32, int64,uint64, float16, float32 .


*@par Attributes: *@par Attributes:
* axis: A optional int32, default value is 1. Indicate up to which input dimensions (exclusive) should be flattened
* to the outer dimension of the output. The value for axis must be in the range [-r, r], where r is the rank of
* the input tensor. Negative value means counting dimensions from the back. When axis = 0, the shape of
* axis: A optional int32, default value is 1. Indicate up to which input dimensions (exclusive) should be flattened
* to the outer dimension of the output. The value for axis must be in the range [-r, r], where r is the rank of
* the input tensor. Negative value means counting dimensions from the back. When axis = 0, the shape of
* the output tensor is (1, (d_0 X d_1 ... d_n), where the shape of the input tensor is (d_0, d_1, ... d_n). * the output tensor is (1, (d_0 X d_1 ... d_n), where the shape of the input tensor is (d_0, d_1, ... d_n).


*@par Third-party framework compatibility *@par Third-party framework compatibility
@@ -723,11 +723,13 @@ REG_OP(CompressFcOp)
*@brief Performs Col2im for each batch entry. \n *@brief Performs Col2im for each batch entry. \n


*@par Inputs: *@par Inputs:
*@li input_x: The Col Tensor. 5-D, shape: `(n, c1, kernel_h*kernel_w, ho*wo, c0)`.
where ho/wo is do = (output_d + 2*padding_d - dilation_d*(kernel_d - 1) - 1)//stride_d + 1 \n
*@li x: The Col Tensor. 4-D, shape: `(n, c, kernel_h*kernel_w, ho*wo)`.
where ho/wo is do = (output_d + 2*padding_d - dilation_d*(kernel_d - 1) - 1)//stride_d + 1.
*@li output_size: The img shape Tensor. 1-D, shape:`(2)`, value: (output_h, output_w). \n


*@par Outputs: *@par Outputs:
*@li output_y: The img Tensor. 5-D, shape: `(n, c1, output_h, output_w, c0)`. \n
*y: The img Tensor. 4-D, shape: `(n, c, output_h, output_w)`. \n



*@par Attributes: *@par Attributes:
*@li kernel_shape: ListInt, value: `(kernel_h, kernel_w)`, the shape of kernel in convolution. *@li kernel_shape: ListInt, value: `(kernel_h, kernel_w)`, the shape of kernel in convolution.
@@ -837,7 +839,7 @@ REG_OP(AffineGrid)
*@par Inputs: *@par Inputs:
*Four inputs, including: *Four inputs, including:
*@li x: The input tensor. *@li x: The input tensor.
*@li size: The shape of output tensor.
*@li size: The shape of output tensor.
*@li stride: The stride of output tensor. *@li stride: The stride of output tensor.
*@li storage_offset: The offset in the underlying storage of the output tensor. \n *@li storage_offset: The offset in the underlying storage of the output tensor. \n




+ 18
- 2
third_party/fwkacllib/inc/runtime/event.h View File

@@ -23,12 +23,18 @@
extern "C" { extern "C" {
#endif #endif


typedef enum rtEventWaitStatus {
EVENT_STATUS_COMPLETE = 0,
EVENT_STATUS_NOT_READY = 1,
EVENT_STATUS_MAX = 2,
} rtEventWaitStatus_t;

/** /**
* @ingroup event_flags * @ingroup event_flags
* @brief event op bit flags * @brief event op bit flags
*/ */
#define RT_EVENT_DEFAULT (0x00)
#define RT_EVENT_WITH_FLAG (0x01)
#define RT_EVENT_DEFAULT (0x0E)
#define RT_EVENT_WITH_FLAG (0x0B)


#define RT_EVENT_DDSYNC_NS 0x01U #define RT_EVENT_DDSYNC_NS 0x01U
#define RT_EVENT_STREAM_MARK 0x02U #define RT_EVENT_STREAM_MARK 0x02U
@@ -111,6 +117,16 @@ RTS_API rtError_t rtEventQuery(rtEvent_t event);


/** /**
* @ingroup dvrt_event * @ingroup dvrt_event
* @brief Queries an event's wait status
* @param [in] event event to query
* @param [in out] EVENT_WAIT_STATUS status
* @return EVENT_STATUS_COMPLETE for complete
* @return EVENT_STATUS_NOT_READY for not complete
*/
RTS_API rtError_t rtEventQueryWaitStatus(rtEvent_t event, rtEventWaitStatus_t *status);

/**
* @ingroup dvrt_event
* @brief computes the elapsed time between events. * @brief computes the elapsed time between events.
* @param [in] time time between start and end in ms * @param [in] time time between start and end in ms
* @param [in] start starting event * @param [in] start starting event


Loading…
Cancel
Save