From f49a21d293f577dd3f57b9156930a850206d114f Mon Sep 17 00:00:00 2001 From: dingpeifei Date: Mon, 5 Jul 2021 16:53:45 +0800 Subject: [PATCH] code_sync_0705_inc --- inc/external/acl/acl.h | 4 +- inc/external/acl/ops/acl_dvpp.h | 97 +++++++++++++ third_party/fwkacllib/inc/ops/array_ops.h | 7 +- .../fwkacllib/inc/ops/elewise_calculation_ops.h | 62 ++++----- third_party/fwkacllib/inc/ops/image_ops.h | 152 +++++++++++++++++++++ .../fwkacllib/inc/ops/matrix_calculation_ops.h | 22 +-- third_party/fwkacllib/inc/ops/nn_detect_ops.h | 33 ++--- third_party/fwkacllib/inc/ops/nn_norm_ops.h | 30 ++-- third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h | 32 ++--- third_party/fwkacllib/inc/ops/pad_ops.h | 6 +- third_party/fwkacllib/inc/ops/random_ops.h | 24 ++++ third_party/fwkacllib/inc/ops/transformation_ops.h | 18 +-- third_party/fwkacllib/inc/runtime/event.h | 20 ++- 13 files changed, 397 insertions(+), 110 deletions(-) diff --git a/inc/external/acl/acl.h b/inc/external/acl/acl.h index 8d261201..a5194472 100644 --- a/inc/external/acl/acl.h +++ b/inc/external/acl/acl.h @@ -25,9 +25,9 @@ extern "C" { #endif -// Current version is 1.0.0 +// Current version is 1.1.0 #define ACL_MAJOR_VERSION 1 -#define ACL_MINOR_VERSION 0 +#define ACL_MINOR_VERSION 1 #define ACL_PATCH_VERSION 0 /** diff --git a/inc/external/acl/ops/acl_dvpp.h b/inc/external/acl/ops/acl_dvpp.h index dcaa3936..3c0723c5 100644 --- a/inc/external/acl/ops/acl_dvpp.h +++ b/inc/external/acl/ops/acl_dvpp.h @@ -158,6 +158,20 @@ enum acldvppJpegFormat { ACL_JPEG_CSS_UNKNOWN = 1000 }; +enum acldvppChannelDescParamType { ACL_DVPP_CSC_MATRIX_UINT32 = 0 }; + +enum aclvdecChannelDescParamType { ACL_VDEC_CSC_MATRIX_UINT32 = 0 }; + +// Csc Matrix can be used both for acldvppChannelDescParamType and aclvdecChannelDescParamType +enum acldvppCscMatrix { + ACL_DVPP_CSC_MATRIX_BT601_WIDE = 0, + ACL_DVPP_CSC_MATRIX_BT601_NARROW, + ACL_DVPP_CSC_MATRIX_BT709_WIDE, + ACL_DVPP_CSC_MATRIX_BT709_NARROW, + ACL_DVPP_CSC_MATRIX_BT2020_WIDE, + 
ACL_DVPP_CSC_MATRIX_BT2020_NARROW +}; + /** * @ingroup AscendCL * @brief alloc device memory for dvpp. @@ -2560,7 +2574,90 @@ ACL_FUNC_VISIBILITY aclError acldvppVpcBatchCropResizeMakeBorderAsync( acldvppChannelDesc *channelDesc, acldvppBatchPicDesc *srcBatchPicDescs, uint32_t *roiNums, uint32_t size, acldvppBatchPicDesc *dstBatchPicDescs, acldvppRoiConfig *cropAreas[], acldvppBorderConfig *borderCfgs[], acldvppResizeConfig *resizeConfig, aclrtStream stream); +/** + * @ingroup AscendCL + * @brief set param for dvpp channel desc + * + * @par Function + * set attribution in dvpp channelDesc for specified type + * + * @param channelDesc [OUT] the channel destruction + * @param paramType [IN] specified param type + * @param length [IN] mem length of param + * @param param [IN] pointer to param + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + * + * @see acldvppGetChannelDescParam | acldvppCreateChannelDesc | acldvppDestroyChannelDesc + */ +ACL_FUNC_VISIBILITY aclError acldvppSetChannelDescParam(acldvppChannelDesc *channelDesc, + acldvppChannelDescParamType paramType, size_t length, + const void *param); + +/** + * @ingroup AscendCL + * @brief get param of dvpp channel desc + * + * @par Function + * get attribution value in dvpp channelDesc for specified type + * + * @param channelDesc [IN] the channel destruction + * @param paramType [IN] specified param type + * @param length [IN] mem length allocated for output param + * @param paramRetSize [OUT] mem length of output param + * @param param [OUT] pointer to output param + * + * @retval ACL_SUCCESS The function is successfully executed. 
+ * @retval OtherValues Failure + * + * @see acldvppSetChannelDescParam | acldvppCreateChannelDesc | acldvppDestroyChannelDesc + */ +ACL_FUNC_VISIBILITY aclError acldvppGetChannelDescParam(const acldvppChannelDesc *channelDesc, + acldvppChannelDescParamType paramType, size_t length, + size_t *paramRetSize, void *param); +/** + * @ingroup AscendCL + * @brief set param for vdec channel desc + * + * @par Function + * set attribution in channelDesc for specified type + * + * @param channelDesc [OUT] the vdec channel destruction + * @param paramType [IN] specified param type + * @param length [IN] mem length of param + * @param param [IN] pointer to param + * + * @retval ACL_SUCCESS The function is successfully executed. + * @retval OtherValues Failure + * + * @see aclvdecGetChannelDescParam | aclvdecCreateChannelDesc | aclvdecDestroyChannelDesc + */ +ACL_FUNC_VISIBILITY aclError aclvdecSetChannelDescParam(aclvdecChannelDesc *channelDesc, + aclvdecChannelDescParamType paramType, size_t length, + const void *param); +/** + * @ingroup AscendCL + * @brief get param of vdec channel desc + * + * @par Function + * get attribution value in channelDesc for specified type + * + * @param channelDesc [IN] the vdec channel destruction + * @param paramType [IN] specified param type + * @param length [IN] mem length allocated for output param + * @param paramRetSize [OUT] mem length of output param + * @param param [OUT] pointer to output param + * + * @retval ACL_SUCCESS The function is successfully executed. 
+ * @retval OtherValues Failure + * + * @see aclvdecSetChannelDescParam | aclvdecCreateChannelDesc | aclvdecDestroyChannelDesc + */ +ACL_FUNC_VISIBILITY aclError aclvdecGetChannelDescParam(const aclvdecChannelDesc *channelDesc, + aclvdecChannelDescParamType paramType, size_t length, + size_t *paramRetSize, void *param); #ifdef __cplusplus } #endif diff --git a/third_party/fwkacllib/inc/ops/array_ops.h b/third_party/fwkacllib/inc/ops/array_ops.h index fd35b546..c203b737 100644 --- a/third_party/fwkacllib/inc/ops/array_ops.h +++ b/third_party/fwkacllib/inc/ops/array_ops.h @@ -1154,18 +1154,17 @@ REG_OP(EditDistance) .OP_END_FACTORY_REG(EditDistance) /** -* @brief sort_v2. +* @brief sort the input tensor without returning the value of index. * @par Inputs: -* @li x: An ND tensor of type float16. +* x: An ND tensor of type float16. * @par Attributes: - * @li axis: An optional int. The dimension to sort along. This value defaults to -1. * @li descending: An optional bool. Controls the sorting order (ascending or descending). This value defaults to False. * @par Outputs: -* @li y: An ND tensor of type float16. +* y: An ND tensor of type float16. * @attention Constraints: * @li Axis should select the last dim. 
diff --git a/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h b/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h index a20272f3..1f85c152 100644 --- a/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h +++ b/third_party/fwkacllib/inc/ops/elewise_calculation_ops.h @@ -624,9 +624,9 @@ REG_OP(Log1p) *@attention Constraints: *@li x2: The input data does not support 0 -*@li When NUM exceeds 2048 , the accuracy of operator cannot guarantee the +*@li When NUM exceeds 2048 , the accuracy of operator cannot guarantee the *requirement of double thousandths in the mini form -*@li Due to different architectures, the calculation results of this operator +*@li Due to different architectures, the calculation results of this operator *on NPU and CPU may be inconsistent *@li If shape is expressed as (D1,D2... ,Dn), then D1*D2... *DN<=1000000,n<=8 @@ -2066,9 +2066,9 @@ REG_OP(FloorDiv) *@attention Constraints: *@li x2: The input data does not support 0 -*@li When NUM exceeds 2048 , the accuracy of operator cannot guarantee the +*@li When NUM exceeds 2048 , the accuracy of operator cannot guarantee the *requirement of double thousandths in the mini form -*@li Due to different architectures, the calculation results of this operator +*@li Due to different architectures, the calculation results of this operator *on NPU and CPU may be inconsistent *@li If shape is expressed as (D1,D2... ,Dn), then D1*D2... *DN<=1000000,n<=8 @@ -2200,9 +2200,9 @@ REG_OP(Tan) *@attention Constraints: *@li x2: The input data does not support 0 -*@li When NUM exceeds 2048 , the accuracy of operator cannot guarantee the +*@li When NUM exceeds 2048 , the accuracy of operator cannot guarantee the *requirement of double thousandths in the mini form -*@li Due to different architectures, the calculation results of this operator +*@li Due to different architectures, the calculation results of this operator *on NPU and CPU may be inconsistent *@li If shape is expressed as (D1,D2... 
,Dn), then D1*D2... *DN<=1000000,n<=8 @@ -3395,7 +3395,7 @@ REG_OP(TensorRedirect) * multiply the result by the scalar value and add it to tensor x1 * @par Inputs: -* Three inputs, including: +* Four inputs, including: * @li input_data: A mutable input Tensor. Must be one of the following types: * float16, float32. * @li x1: A mutable input Tensor of the same type as x1. @@ -3404,7 +3404,7 @@ REG_OP(TensorRedirect) * float16, float32, int32. \n * @par Outputs: -* @li y: A mutable Tensor. Has the same type as "x1". \n +* y: A mutable Tensor. Has the same type as "x1". \n * @par Third-party framework compatibility * Compatible with the Pytorch operator Addcdiv. @@ -3418,12 +3418,12 @@ REG_OP(Addcdiv) .OP_END_FACTORY_REG(Addcdiv) /** -* @brief Performs the element-wise multiplication of tensor x2 by tensor x3, -* multiply the result by the scalar value and add it to tensor input_data +* @brief Performs the element-wise multiplication of tensor x2 by tensor x3, +* multiply the result by the scalar value and add it to tensor input_data * @par Inputs: -* Three inputs, including: +* Four inputs, including: * @li input_data: A mutable input Tensor. Must be one of the following types: * float16, float32, int8, int32, uint8. * @li x1: A mutable input Tensor of the same type as x1. @@ -3431,7 +3431,7 @@ REG_OP(Addcdiv) * @li value: A tensor which includes only one element of the same type as x1. \n * @par Outputs: -* @li y: A mutable output Tensor. Has the same type as "x1". \n +* y: A mutable output Tensor. Has the same type as "x1". \n * @par Third-party framework compatibility * Compatible with the Pytorch operator Addcmul. @@ -3453,7 +3453,7 @@ REG_OP(Addcmul) * @li alpha: A scalar tensor of type float16, float32. \n * @par Outputs: -* @li y: An ND tensor tensor with the same shape and type as "x1". \n +* y: An ND tensor tensor with the same shape and type as "x1". \n * @par Third-party framework compatibility * Compatible with the Pytorch operator Axpy. 
@@ -3533,21 +3533,21 @@ REG_OP(TensorEqual) .OP_END_FACTORY_REG(TensorEqual) /** - * @brief Element-wise min of each of the input tensors (with Numpy-style broadcasting support). - * All inputs and outputs must have the same data type. This operator supports multidirectional + * @brief Element-wise min of each of the input tensors (with Numpy-style broadcasting support). + * All inputs and outputs must have the same data type. This operator supports multidirectional * (i.e., Numpy-style) broadcasting - * - * @par inputs + * + * @par Inputs: * one input including: - * @li x: dynamic input A Tensor. Must be one of the following types: float32, float16, double, int32, int64 - * - * @par output + * x: dynamic input A Tensor. Must be one of the following types: float32, float16, double, int32, int64 + * + * @par Outputs: * one output including: - * @li y:A Tensor of the same type as x - * + * y:A Tensor of the same type as x + * */ REG_OP(MaxN) - .DYNAMIC_INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_FLOAT64, DT_INT32, DT_INT64})) + .DYNAMIC_INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT, DT_FLOAT64, DT_INT32, DT_INT64})) .OUTPUT(y, TensorType({DT_FLOAT16, DT_FLOAT, DT_FLOAT64, DT_INT32, DT_INT64})) .OP_END_FACTORY_REG(MaxN) @@ -3632,16 +3632,16 @@ REG_OP(DataCompare) *which Hardmax will be performed.The output tensor has the same shape and contains the Hardmax values of the *corresponding input. 
* -*@par inputs +*@par Inputs: *one input including: -*@li x: input A Tensor.Must be one of the following types:float32,float16 +*x: input A Tensor.Must be one of the following types:float32,float16 * *@par Attributes: -*@li axis:A required int attribute that decides which dimension will be used to cal the hard_max +*axis:A required int attribute that decides which dimension will be used to cal the hard_max * -*@par output: +*@par Outputs: *one output including: -*@li y:A Tensor of the same type as x +*y:A Tensor of the same type as x * */ REG_OP(HardMax) @@ -3669,7 +3669,7 @@ REG_OP(Dot) .INPUT(input_y, TensorType({DT_FLOAT, DT_FLOAT16, DT_UINT8, DT_INT8, DT_INT32})) .OUTPUT(output, TensorType({DT_FLOAT, DT_FLOAT16, DT_UINT8, DT_INT8, DT_INT32})) .OP_END_FACTORY_REG(Dot) - + /** *@brief Returns a new tensor with boolean elements representing \n *if each element of input is “close” to the corresponding element of other \n @@ -3717,7 +3717,7 @@ REG_OP(IsClose) * *@attention Constraints: *@li indices: only support int32,and shape same to "updates" -*@li The value range of "dimension" is [-dims, dims - 1]. "dims" is the dimension length of "x". +*@li The value range of "dimension" is [-dims, dims - 1]. "dims" is the dimension length of "x". *@li y:A Tensor, the type and shape is same to "var" \n *@par Third-party framework compatibility @@ -3752,7 +3752,7 @@ REG_OP(ArgMaxGrad) *@attention Constraints: *@li indices: only support int32,and shape same to "updates" -*@li The value range of "dimension" is [-dims, dims - 1]. "dims" is the dimension length of "x". +*@li The value range of "dimension" is [-dims, dims - 1]. "dims" is the dimension length of "x". 
*@li y:A Tensor, the type and shape is same to "var" \n *@par Third-party framework compatibility diff --git a/third_party/fwkacllib/inc/ops/image_ops.h b/third_party/fwkacllib/inc/ops/image_ops.h index 28bf6228..0a796cd6 100644 --- a/third_party/fwkacllib/inc/ops/image_ops.h +++ b/third_party/fwkacllib/inc/ops/image_ops.h @@ -1512,6 +1512,9 @@ REG_OP(IMGWarp) *@par Outputs: *map_img: A Tensor after resize. \n + +*@par Restrictions: +*Warning:THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(Remap) .INPUT(img, TensorType({DT_UINT8, DT_FLOAT16, DT_FLOAT32})) @@ -1848,6 +1851,9 @@ REG_OP(GridUnnormal) *@par Outputs: *y: Returns 4-D Tensor with the same dtype as `x`. + +*@par Restrictions: +*Warning:THIS FUNCTION IS EXPERIMENTAL. Please do not use. */ REG_OP(ImageUnfold) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) @@ -1940,5 +1946,151 @@ REG_OP(GridSampler3DGrad) .ATTR(align_corners, Bool, false) .OP_END_FACTORY_REG(GridSampler3DGrad) +/** +*@brief Upsample the 3-D data with the nearest neighbor ​interpolation algorithm. \n + +*@par Inputs: +*One inputs, including: +* @li x: A 5-D input tensor [N, C, D, H, W]. Must be one of the following types: +* float32, float64. \n + +*@par Attributes: +*@li output_size: An optional listInt. Defaults to none. + contain 3 elements: output_depth, output_height, output_width. The number of elements of 'output_size' + should be the same as the rank of input 'x'. Only one of 'scales' and 'output_size' can be specified. \n +*@li scales: An optional listFloat. Defaults to none. + The scale array along each dimension, contain 3 elements: scale_depth, scale_height, scale_width. + The number of elements of 'scales' should be the same as the rank of input 'x'. One of 'scales' and + 'output_size' MUST be specified and it is an error if both are specified. \n + +*@par Outputs: +*y: A 5-D tensor. Has the same type as input x, shape depends on x and output_size/scales. 
\n + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. \n +*/ + +REG_OP(UpsampleNearest3d) + .INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE})) + .OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE})) + .ATTR(output_size, ListInt, {}) + .ATTR(scales, ListFloat, {}) + .OP_END_FACTORY_REG(UpsampleNearest3d) + +/** +*@brief Upsample the 3-D data with the trilinear ​interpolation algorithm. \n + +*@par Inputs: +*One inputs, including: +* @li x: A 5-D input tensor [N, C, D, H, W]. Must be one of the following types: +* float32, float64. \n + +*@par Attributes: +*@li output_size: An optional listInt. Defaults to none. + contain 3 elements: output_depth, output_height, output_width. The number of elements of 'output_size' should + be the same as the rank of input 'x'. Only one of 'scales' and 'output_size' can be specified. \n +*@li scales: An optional listFloat. Defaults to none. + The scale array along each dimension, contain 3 elements: scale_depth, scale_height, scale_width. + The number of elements of 'scales' should be the same as the rank of input 'x'. + One of 'scales' and 'output_size' MUST be specified and it is an error if both are specified. \n +*@li align_corners: An optional bool. Defaults to false. + If true, the input and output tensors are aligned by the center points of their corner pixels, preserving the + values at the corner pixels. If false, the input and output tensors are aligned by the corner points of their + corner pixels, and the interpolation use edge value padding for out of boundary values. \n + +*@par Outputs: +*y: A 5-D tensor. Has the same type as input x, shape depends on x and output_size/scales. \n + +*@par Restrictions: +*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use. 
\n
+*/
+
+REG_OP(UpsampleTrilinear3d)
+    .INPUT(x, TensorType({DT_FLOAT, DT_DOUBLE}))
+    .OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE}))
+    .ATTR(output_size, ListInt, {})
+    .ATTR(scales, ListFloat, {})
+    .ATTR(align_corners, Bool, false)
+    .OP_END_FACTORY_REG(UpsampleTrilinear3d)
+
+/**
+*@brief Upsample the 3-D gradient data with the nearest neighbor interpolation algorithm. \n
+
+*@par Inputs:
+*One input, including:
+* @li grad_output: A 5-D input tensor [N, C, D, H, W]. Must be one of the following types:
+* float32, float64. \n
+
+*@par Attributes:
+*@li input_size: A required listInt.
+    contain 5 elements: [min_batch, channels, depth, height, width]. Must:
+    input_size[0] == grad_output_tensor_size[0]
+    input_size[1] == grad_output_tensor_size[1]. \n
+*@li output_size: An optional listInt. Defaults to none.
+    contain 3 elements: depth, height, width. The number of elements of 'output_size' should
+    be the same as the rank of input 'grad_output'. Only one of 'scales' and 'output_size' can be specified. Must:
+    grad_output_tensor_size[2] == floor(input_size[2] * scales[0]) == output_size[0]
+    grad_output_tensor_size[3] == floor(input_size[3] * scales[1]) == output_size[1]
+    grad_output_tensor_size[4] == floor(input_size[4] * scales[2]) == output_size[2]. \n
+*@li scales: An optional listFloat. Defaults to none.
+    The scale array along each dimension, contain 3 elements: scale_depth, scale_height, scale_width.
+    The number of elements of 'scales' should be the same as the rank of input 'grad_output'.
+    One of 'scales' and 'output_size' MUST be specified and it is an error if both are specified. \n
+
+*@par Outputs:
+*y: A 5-D tensor. Has the same type as input grad_output, shape depends on Attributes:input_size. \n
+
+*@par Restrictions:
+*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
+*/
+
+REG_OP(UpsampleNearest3dGrad)
+    .INPUT(grad_output, TensorType({DT_FLOAT, DT_DOUBLE}))
+    .OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE}))
+    .REQUIRED_ATTR(input_size, ListInt)
+    .ATTR(output_size, ListInt, {})
+    .ATTR(scales, ListFloat, {})
+    .OP_END_FACTORY_REG(UpsampleNearest3dGrad)
+
+/**
+*@brief Upsample the 3-D gradient data with the trilinear interpolation algorithm. \n
+
+*@par Inputs:
+*One input, including:
+* @li grad_output: A 5-D input tensor [N, C, D, H, W]. Must be one of the following types:
+* float32, float64. \n
+
+*@par Attributes:
+*@li input_size: A required listInt.
+    contain 5 elements: [min_batch, channels, depth, height, width]. Must:
+    input_size[0] == grad_output_tensor_size[0]
+    input_size[1] == grad_output_tensor_size[1]. \n
+*@li output_size: An optional listInt. Defaults to none.
+    contain 3 elements: depth, height, width. The number of elements of 'output_size' should
+    be the same as the rank of input 'grad_output'. Only one of 'scales' and 'output_size' can be specified. Must:
+    grad_output_tensor_size[2] == floor(input_size[2] * scales[0]) == output_size[0]
+    grad_output_tensor_size[3] == floor(input_size[3] * scales[1]) == output_size[1]
+    grad_output_tensor_size[4] == floor(input_size[4] * scales[2]) == output_size[2]. \n
+*@li scales: An optional listFloat. Defaults to none.
+    The scale array along each dimension, contain 3 elements: scale_depth, scale_height, scale_width.
+    The number of elements of 'scales' should be the same as the rank of input 'grad_output'.
+    One of 'scales' and 'output_size' MUST be specified and it is an error if both are specified. \n
+
+*@par Outputs:
+*y: A Tensor with shape depends on input_size and output_size/scales. Must be one of the following
+    types: float32, float64. \n
+
+*@par Restrictions:
+*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
+*/ + +REG_OP(UpsampleTrilinear3dGrad) + .INPUT(grad_output, TensorType({DT_FLOAT, DT_DOUBLE})) + .OUTPUT(y, TensorType({DT_FLOAT, DT_DOUBLE})) + .REQUIRED_ATTR(input_size, ListInt) + .ATTR(output_size, ListInt, {}) + .ATTR(scales, ListFloat, {}) + .ATTR(align_corners, Bool, false) + .OP_END_FACTORY_REG(UpsampleTrilinear3dGrad) } // namespace ge #endif // OPS_BUILT_IN_OP_PROTO_INC_IMAGE_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h b/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h index 5341a95c..336d71a9 100644 --- a/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h +++ b/third_party/fwkacllib/inc/ops/matrix_calculation_ops.h @@ -1120,11 +1120,12 @@ REG_OP(IndexAdd) *@brief: Returns the upper triangular part of a matrix (2-D tensor) or batch of matrices input \n *@par Inputs: -* Two inputs, including: -*@li x: A Tensor. Must be one of the following types: -* float16, float32, double, int32, uint8, int16, int8, complex64, int64, -* qint8, quint8, qint32, uint16, complex128, uint32, uint64. -*@li diagonal:(int, optional) – the diagonal to consider。\n +*x: A Tensor. Must be one of the following types: +*float16, float32, double, int32, uint8, int16, int8, complex64, int64, +*qint8, quint8, qint32, uint16, complex128, uint32, uint64. \n + +*@par Attributes: +*diagonal: An optional attribute indicates the diagonal to consider. \n *@par Outputs: *y: A Tensor. Has the same type as "x" . \n @@ -1142,11 +1143,12 @@ REG_OP(Triu) *@brief: Returns the upper triangular part of a matrix (2-D tensor) or batch of matrices input \n *@par Inputs: -* Two inputs, including: -*@li x: A Tensor. Must be one of the following types: -* float16, float32, double, int32, uint8, int16, int8, complex64, int64, -* qint8, quint8, qint32, uint16, complex128, uint32, uint64. -*@li diagonal:(int, optional) – the diagonal to consider。\n +*x: A Tensor. 
Must be one of the following types:
+*float16, float32, double, int32, uint8, int16, int8, complex64, int64,
+*qint8, quint8, qint32, uint16, complex128, uint32, uint64. \n
+
+*@par Attributes:
+*diagonal: An optional attribute indicates the diagonal to consider. \n
 
 *@par Outputs:
 *y: A Tensor. Has the same type as "x" . \n
diff --git a/third_party/fwkacllib/inc/ops/nn_detect_ops.h b/third_party/fwkacllib/inc/ops/nn_detect_ops.h
index 9f35e27a..ec50aa2e 100644
--- a/third_party/fwkacllib/inc/ops/nn_detect_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_detect_ops.h
@@ -1445,16 +1445,16 @@ REG_OP(DecodeBboxV2)
     .OP_END_FACTORY_REG(DecodeBboxV2)
 
 /**
-*@brief Computes sort function.
+*@brief sort the input tensor and return the value of index.
 *
 *@par Inputs:
 *Inputs include:
-* x: A Tensor. Dtype support: flaot16, flaot, int16, int8,
+* x: A Tensor. Dtype support: float16, float, int16, int8,
     uint8, int32, int64.
 *
 *@par Attributes:
-* @li axis: optional, int.
-* @li descending: optional,bool.
+* @li axis: An optional attribute indicates the sorting axis.
+* @li descending: An optional attribute indicates descending sort or not.
 *
 *@par Outputs:
 * @li y1: A Tensor. Must have the same type as x.
@@ -1515,10 +1515,10 @@ whether boxes overlap too much with respect to IOU.
 deciding when to remove boxes based on score . \n
 
 *@par Attributes:
-*center_point_box:Integer indicate the format of the box data. 
-The default is 0. 0 - the box data is supplied as [y1, x1, y2, x2] 
-where (y1, x1) and (y2, x2) are the coordinates of any diagonal pair 
-of box corners and the coordinates can be provided as normalized 
+*center_point_box:Integer indicate the format of the box data.
+The default is 0. 0 - the box data is supplied as [y1, x1, y2, x2]
+where (y1, x1) and (y2, x2) are the coordinates of any diagonal pair
+of box corners and the coordinates can be provided as normalized
 (i.e., lying in the interval [0, 1]) or absolute.Mostly used for TF models.
1 - the box data is supplied as [x_center, y_center, width, height]. Mostly used for Pytorch models. \n @@ -1567,16 +1567,18 @@ deciding when to remove boxes based on score . \n the last dim representing (batch_id,class_id,index_id) . \n *@par Attributes: -*center_point_box:Integer indicate the format of the box data. -The default is 0. 0 - the box data is supplied as [y1, x1, y2, x2] -where (y1, x1) and (y2, x2) are the coordinates of any diagonal pair -of box corners and the coordinates can be provided as normalized +*@li center_point_box:Integer indicate the format of the box data. +The default is 0. 0 - the box data is supplied as [y1, x1, y2, x2] +where (y1, x1) and (y2, x2) are the coordinates of any diagonal pair +of box corners and the coordinates can be provided as normalized (i.e., lying in the interval [0, 1]) or absolute.Mostly used for TF models. 1 - the box data is supplied as [x_center, y_center, width, height]. - Mostly used for Pytorch models. \n + Mostly used for Pytorch models. +*@li max_boxes_size: An optional attribute integer representing the real maximum +*number of boxes to be selected by non max suppression . \n *@par Outputs: -*@li selected_indices: A 2-D integer tensor of shape [M] representing the +*selected_indices: A 2-D integer tensor of shape [M] representing the selected indices from the boxes tensor, where M <= max_output_size. \n *@attention Constraints: @@ -1602,7 +1604,7 @@ REG_OP(NonMaxSuppressionV7) *@brief Obtains the ROI feature matrix from the feature map list. It is a customized fused operator for mmdetection. \n *@par Inputs: -* Three inputs, including: +* Two inputs, including: *@li features: A 5HD Tensor list of type float32 or float16. *@li rois: ROI position. A 2D Tensor of float32 or float16 with shape (N, 5). "N" indicates the number of ROIs, * the value "5" indicates the indexes of images where the ROIs are located, "x0", "y0", "x1", and "y1". 
@@ -1818,4 +1820,3 @@ REG_OP(GridAssignPositive) } // namespace ge #endif // OPS_BUILT_IN_OP_PROTO_INC_NN_DETECT_OPS_H_ - diff --git a/third_party/fwkacllib/inc/ops/nn_norm_ops.h b/third_party/fwkacllib/inc/ops/nn_norm_ops.h index 10047d55..66d17bc8 100644 --- a/third_party/fwkacllib/inc/ops/nn_norm_ops.h +++ b/third_party/fwkacllib/inc/ops/nn_norm_ops.h @@ -568,7 +568,7 @@ REG_OP(LayerNorm) .OP_END_FACTORY_REG(LayerNorm) /** -*@brief Returns a tensor where each sub-tensor of input along dimension +*@brief Returns a tensor where each sub-tensor of input along dimension * dim is normalized such that the p-norm of the sub-tensor is lower than the value maxnorm. \n *@par Inputs: @@ -576,7 +576,7 @@ REG_OP(LayerNorm) * @li x: A Tensor. Must be one of the following types: float16, float32 . \n *@par Attributes: -* @li p: Specify L_p norm, the type is float. +* @li p: Specify L_p norm, the type is float. * @li dim: The processed dim, the type is int. * @li maxnorm: Threshold for comparison, the type is float. \n @@ -1543,14 +1543,14 @@ REG_OP(SigmoidCrossEntropyWithLogitsGradV2) .ATTR(reduction, String, "mean") .OP_END_FACTORY_REG(SigmoidCrossEntropyWithLogitsGradV2) /** - * @brief Calculate the PoissonNllLoss function. + * @brief Calculate the PoissonNllLoss function. * target∼Poisson(input)loss(input,target)=input−target∗log(input)+log(target!) \n * @par Inputs: * Two inputs, including: * @li input_x: A tensor. Must be one of the following types: * float16, float32. \n - * + * * @par Inputs: * @li target: A tensor. Must be one of the following types: * float16, float32. \n @@ -1558,13 +1558,13 @@ REG_OP(SigmoidCrossEntropyWithLogitsGradV2) * @par Attributes: * four Attributes, including: * @li log_input: An optional bool. Defaults to "True" \n - * + * * @par Attributes: * @li full: An optional bool. Defaults to "False" \n - * + * * @par Attributes: * @li eps: An optional float. Defaults to "1e-8" \n - * + * * @par Attributes: * @li reduction: An optional string. 
Defaults to "mean" \n @@ -1592,7 +1592,7 @@ REG_OP(PoissonNllLoss) * @li num_step: A required int.\n * @li hidden_size: A required int. \n * - * + * * @par Output: * y: A mutable Tensor of type float16, with the shape of [num_step, batch_size, hidden_size]. \n * @@ -1605,24 +1605,22 @@ REG_OP(RnnGenMask) .OP_END_FACTORY_REG(RnnGenMask) /** -* @brief Creates a criterion that optimizes a multi-class multi-classification hinge loss (margin-based loss) +* @brief Creates a criterion that optimizes a multi-class multi-classification hinge loss (margin-based loss) * between input x (a 2D mini-batch Tensor) and output y (which is a 2D Tensor of target class indices) \n - + * @par Inputs: * Two inputs, including: * @li x: A tensor. Must be one of the following types: -* float16, float32. \n -* -* @par Inputs: +* float16, float32. * @li target: A tensor. Must be the following types: * int32. \n * @par Attributes: -* @li reduction: An optional string. Defaults to "mean" \n +* reduction: An optional string. Defaults to "mean" \n * @par Outputs: -* y: A Tensor has same element type as input x. \n -* is_target: A Tensor has same element type as input target. \n +* @li y: A Tensor has same element type as input x. \n +* @li is_target: A Tensor has same element type as input target. \n * @par Third-party framework compatibility * Compatible with the Pytorch operator MultiLabelMarginLoss. \n diff --git a/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h b/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h index b9df706b..bf850019 100644 --- a/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h +++ b/third_party/fwkacllib/inc/ops/nonlinear_fuc_ops.h @@ -224,22 +224,22 @@ REG_OP(Relu6Grad) .OUTPUT(backprops, TensorType::RealNumberType()) .OP_END_FACTORY_REG(Relu6Grad) /** -*@brief Calculate the elu_grad_v2 function. +*@brief Calculate the elu_grad_v2 function. *Applies the element-wise function: * Computes the backward for the elu: if x>0, 1; otherwise elu() + alpha . 
*@par Inputs: *One inputs, including: * @li grads: A tensor. Must be one of the following types: -* float16, float32. +* float16, float32. * @li activations: A tensor. Must be one of the following types: -* float16, float32. +* float16, float32. * *@par Outputs: *y: A Tensor with the same type and shape of grads's. -* +* *@par Attributes: *@li alpha: scalar parameter, default value = 1.0 -*/ +*/ REG_OP(EluGradV2) .INPUT(grads, TensorType({DT_FLOAT, DT_FLOAT16})) .INPUT(activations, TensorType({DT_FLOAT, DT_FLOAT16})) @@ -539,24 +539,20 @@ REG_OP(Elu) *x: A float16, float32, for the input data type . \n *@par Attributes: -*alpha1: A float32. Defines at which negative value the ELU saturates. Defaults to "1.0" . \n - -*@par Attributes: -*alpha2: A float32. Defines at which negative value the ELU saturates. Defaults to "1.0" . \n - -*@par Attributes: -*alpha3: A float32. Defines at which positive value the ELU saturates. Defaults to "1.0" . \n +*@li alpha1: A float32. Defines at which negative value the ELU saturates. Defaults to "1.0" . +*@li alpha2: A float32. Defines at which negative value the ELU saturates. Defaults to "1.0" . +*@li alpha3: A float32. Defines at which positive value the ELU saturates. Defaults to "1.0" . \n *@par Outputs: *y: A float16, float32, for the normalized result . \n *@attention Constraints: -*@li The input is of type float16 or float32 . \n +*The input is of type float16 or float32 . \n *@par Multiple batches supported or not *Supported *@par Third-party framework compatibility -*@li Compatible with ONNX's Celu operator +*Compatible with ONNX's Celu operator */ REG_OP(Celu) .INPUT(x, TensorType({DT_FLOAT,DT_FLOAT16})) @@ -808,15 +804,15 @@ REG_OP(SoftplusV2Grad) /** * @brief ThresholdedRelu takes one input data (Tensor) and produces one output data (Tensor) * where the rectified linear function, y = x for x > alpha, y = 0 otherwise, is applied to the tensor elementwise. 
- * + * * @par inputs * one input including: * @li x: input A Tensor. Must be one of the following types: float32, float16 - * + * * @par output * one output including: * @li y:A Tensor of the same type as x - * + * */ REG_OP(ThresholdedRelu) .INPUT(x, TensorType({DT_FLOAT16, DT_FLOAT})) @@ -892,7 +888,7 @@ REG_OP(HardShrink) * @par Third-party framework compatibility * Compatible with the Pytorch operator Hardsigmoid. \n -*/ +*/ REG_OP(HardSigmoid) .INPUT(input_x, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT32})) .OUTPUT(output_y, TensorType({DT_FLOAT, DT_FLOAT16})) diff --git a/third_party/fwkacllib/inc/ops/pad_ops.h b/third_party/fwkacllib/inc/ops/pad_ops.h index 6854c866..4efb2683 100644 --- a/third_party/fwkacllib/inc/ops/pad_ops.h +++ b/third_party/fwkacllib/inc/ops/pad_ops.h @@ -213,11 +213,11 @@ REG_OP(PadV2) *@brief Pads a tensor . \n *@par Inputs: -*x: A Tensor. Must be one of the following types: float16, float32, int32 . \n -*constant_values: A Tensor. Must have the same type as input. +*@li x: A Tensor. Must be one of the following types: float16, float32, int32 . \n +*@li constant_values: A Tensor. Must have the same type as input. *@par Attributes: -*paddings: An optional "vector>". Defaults to "{}". +*paddings: A required Attribute. * For each dimension D of input, paddings[D, 0] indicates how many * values to add before the contents of tensor in that dimension, * and paddings[D, 1] indicates how many values to add after the diff --git a/third_party/fwkacllib/inc/ops/random_ops.h b/third_party/fwkacllib/inc/ops/random_ops.h index afa3bb45..f607315a 100644 --- a/third_party/fwkacllib/inc/ops/random_ops.h +++ b/third_party/fwkacllib/inc/ops/random_ops.h @@ -584,6 +584,30 @@ REG_OP(DropoutV2) .OUTPUT(seed, TensorType({ DT_FLOAT })) .REQUIRED_ATTR(p, Float) .OP_END_FACTORY_REG(DropoutV2) + +/** +* @brief The Bernoulli distribution with probability . \n + +* @par Inputs: +* @li x: A ND Tensor. 
Must be one of the following data types: + int8, uint8, int16, int32, int64, bool, float32, float64 . +* @li p: A ND Tensor. The probability of an element to be zeroed. + Must be one of the following data types: float32, float64. \n + +* @par Attributes: +* seed: An Integer, the seed of the random generator. Default value -1 + to use current timestamp, otherwise it should be a positive integer. + +* @par Outputs: +* y: A tensor with the same shape and type as "x". +*/ + +REG_OP(Bernoulli) + .INPUT(x, TensorType({ DT_INT8, DT_UINT8, DT_INT16, DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT, DT_DOUBLE})) + .INPUT(p, TensorType({ DT_FLOAT, DT_DOUBLE })) + .OUTPUT(y, TensorType({ DT_INT8, DT_UINT8, DT_INT16, DT_INT32, DT_INT64, DT_BOOL, DT_FLOAT, DT_DOUBLE})) + .ATTR(seed, Int, -1) + .OP_END_FACTORY_REG(Bernoulli) } // namespace ge #endif // OPS_BUILT_IN_OP_PROTO_INC_RANDOM_OPS_H_ diff --git a/third_party/fwkacllib/inc/ops/transformation_ops.h b/third_party/fwkacllib/inc/ops/transformation_ops.h index 40fcf911..e096fd3e 100644 --- a/third_party/fwkacllib/inc/ops/transformation_ops.h +++ b/third_party/fwkacllib/inc/ops/transformation_ops.h @@ -182,14 +182,14 @@ REG_OP(Permute) * int8, uint8, int16, uint16, int32, uint32, int64,uint64, float16, float32. *@par Outputs: -* y: A 2D flattened Tensor with the contents of the input tensor, with input dimensions up to axis flattened +* y: A 2D flattened Tensor with the contents of the input tensor, with input dimensions up to axis flattened * to the outer dimension of the output and remaining input dimensions flattened into the inner dimension of the output. * Must be one of the following data types: int8, uint8, int16, uint16, int32, uint32, int64,uint64, float16, float32 . *@par Attributes: -* axis: A optional int32, default value is 1. Indicate up to which input dimensions (exclusive) should be flattened -* to the outer dimension of the output. 
The value for axis must be in the range [-r, r], where r is the rank of -* the input tensor. Negative value means counting dimensions from the back. When axis = 0, the shape of +* axis: A optional int32, default value is 1. Indicate up to which input dimensions (exclusive) should be flattened +* to the outer dimension of the output. The value for axis must be in the range [-r, r], where r is the rank of +* the input tensor. Negative value means counting dimensions from the back. When axis = 0, the shape of * the output tensor is (1, (d_0 X d_1 ... d_n), where the shape of the input tensor is (d_0, d_1, ... d_n). *@par Third-party framework compatibility @@ -723,11 +723,13 @@ REG_OP(CompressFcOp) *@brief Performs Col2im for each batch entry. \n *@par Inputs: -*@li input_x: The Col Tensor. 5-D, shape: `(n, c1, kernel_h*kernel_w, ho*wo, c0)`. -where ho/wo is do = (output_d + 2*padding_d - dilation_d*(kernel_d - 1) - 1)//stride_d + 1 \n +*@li x: The Col Tensor. 4-D, shape: `(n, c, kernel_h*kernel_w, ho*wo)`. +where ho/wo is do = (output_d + 2*padding_d - dilation_d*(kernel_d - 1) - 1)//stride_d + 1. +*@li output_size: The img shape Tensor. 1-D, shape:`(2)`, value: (output_h, output_w). \n *@par Outputs: -*@li output_y: The img Tensor. 5-D, shape: `(n, c1, output_h, output_w, c0)`. \n +*y: The img Tensor. 4-D, shape: `(n, c, output_h, output_w)`. \n + *@par Attributes: *@li kernel_shape: ListInt, value: `(kernel_h, kernel_w)`, the shape of kernel in convolution. @@ -837,7 +839,7 @@ REG_OP(AffineGrid) *@par Inputs: *Four inputs, including: *@li x: The input tensor. -*@li size: The shape of output tensor. +*@li size: The shape of output tensor. *@li stride: The stride of output tensor. *@li storage_offset: The offset in the underlying storage of the output tensor. 
\n diff --git a/third_party/fwkacllib/inc/runtime/event.h b/third_party/fwkacllib/inc/runtime/event.h index 9e555230..57948c47 100644 --- a/third_party/fwkacllib/inc/runtime/event.h +++ b/third_party/fwkacllib/inc/runtime/event.h @@ -23,12 +23,18 @@ extern "C" { #endif +typedef enum rtEventWaitStatus { + EVENT_STATUS_COMPLETE = 0, + EVENT_STATUS_NOT_READY = 1, + EVENT_STATUS_MAX = 2, +} rtEventWaitStatus_t; + /** * @ingroup event_flags * @brief event op bit flags */ -#define RT_EVENT_DEFAULT (0x00) -#define RT_EVENT_WITH_FLAG (0x01) +#define RT_EVENT_DEFAULT (0x0E) +#define RT_EVENT_WITH_FLAG (0x0B) #define RT_EVENT_DDSYNC_NS 0x01U #define RT_EVENT_STREAM_MARK 0x02U @@ -111,6 +117,16 @@ RTS_API rtError_t rtEventQuery(rtEvent_t event); /** * @ingroup dvrt_event + * @brief Queries an event's wait status + * @param [in] event event to query + * @param [out] status wait status of the event + * @return EVENT_STATUS_COMPLETE for complete + * @return EVENT_STATUS_NOT_READY for not complete + */ +RTS_API rtError_t rtEventQueryWaitStatus(rtEvent_t event, rtEventWaitStatus_t *status); + +/** + * @ingroup dvrt_event * @brief computes the elapsed time between events. * @param [in] time time between start and end in ms * @param [in] start starting event