From 60cbe0c4d5a11f32aa9be6981812679c90e79b6f Mon Sep 17 00:00:00 2001
From: yanghaoran <yanghaoran2@huawei.com>
Date: Thu, 18 Nov 2021 16:01:24 +0800
Subject: [PATCH] upgrade Ascend package 18 Nov 21

---
 inc/external/acl/error_codes/rt_error_codes.h      |  1 +
 inc/framework/common/ge_types.h                    |  1 +
 metadef                                            |  2 +-
 .../inc/external/runtime/rt_error_codes.h          |  9 ++-
 third_party/fwkacllib/inc/ops/array_ops.h          | 52 +++++++++--------
 third_party/fwkacllib/inc/ops/image_ops.h          | 21 ++++++-
 third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h  | 36 +++++++++---
 third_party/fwkacllib/inc/ops/ocr_ops.h            |  4 +-
 third_party/fwkacllib/inc/ops/reduce_ops.h         | 21 +++++++
 third_party/fwkacllib/inc/ops/vector_search.h      | 66 +++++++++++++++++++++-
 third_party/fwkacllib/inc/runtime/base.h           |  6 +-
 third_party/fwkacllib/inc/runtime/config.h         | 12 +++-
 third_party/fwkacllib/inc/runtime/dev.h            | 11 +++-
 third_party/fwkacllib/inc/runtime/kernel.h         | 16 ++++--
 third_party/fwkacllib/inc/runtime/mem.h            |  2 +-
 third_party/fwkacllib/inc/runtime/rt.h             |  3 +-
 third_party/fwkacllib/inc/toolchain/prof_acl_api.h | 12 ++--
 17 files changed, 217 insertions(+), 58 deletions(-)

diff --git a/inc/external/acl/error_codes/rt_error_codes.h b/inc/external/acl/error_codes/rt_error_codes.h
index 556652be..abfa30db 100644
--- a/inc/external/acl/error_codes/rt_error_codes.h
+++ b/inc/external/acl/error_codes/rt_error_codes.h
@@ -60,6 +60,7 @@ static const int32_t ACL_ERROR_RT_NO_CDQ_RESOURCE = 207011;      // no cdq resou
 static const int32_t ACL_ERROR_RT_OVER_LIMIT = 207012;           // over limit
 static const int32_t ACL_ERROR_RT_QUEUE_EMPTY = 207013;          // queue is empty
 static const int32_t ACL_ERROR_RT_QUEUE_FULL = 207014;           // queue is full
+static const int32_t ACL_ERROR_RT_REPEATED_INIT = 207015;        // repeated init
 
 static const int32_t ACL_ERROR_RT_INTERNAL_ERROR = 507000;              // runtime internal error
 static const int32_t ACL_ERROR_RT_TS_ERROR = 507001;                    // ts internel error
diff --git a/inc/framework/common/ge_types.h b/inc/framework/common/ge_types.h
index dcfb4961..060a7bf0 100644
--- a/inc/framework/common/ge_types.h
+++ b/inc/framework/common/ge_types.h
@@ -293,6 +293,7 @@ struct OpDescInfo {
   std::string dev_func;
   std::string tvm_magic;
   uint32_t tiling_key = 0U;
+  uintptr_t args = 0U;
   std::string tiling_data;
   std::string node_info;
   std::vector<int64_t> workspace_bytes;
diff --git a/metadef b/metadef
index 7d777404..fe47d04d 160000
--- a/metadef
+++ b/metadef
@@ -1 +1 @@
-Subproject commit 7d777404b3b7fe7daeaf00e566e431c6a05b040a
+Subproject commit fe47d04d75170006fc0d28538dec49a2da426ceb
diff --git a/third_party/fwkacllib/inc/external/runtime/rt_error_codes.h b/third_party/fwkacllib/inc/external/runtime/rt_error_codes.h
index c5423d36..09ce1f65 100644
--- a/third_party/fwkacllib/inc/external/runtime/rt_error_codes.h
+++ b/third_party/fwkacllib/inc/external/runtime/rt_error_codes.h
@@ -58,6 +58,10 @@ static const int32_t ACL_ERROR_RT_NO_STREAM_RESOURCE         = 207008; // no str
 static const int32_t ACL_ERROR_RT_NO_NOTIFY_RESOURCE         = 207009; // no notify resource
 static const int32_t ACL_ERROR_RT_NO_MODEL_RESOURCE          = 207010; // no model resource
 static const int32_t ACL_ERROR_RT_NO_CDQ_RESOURCE            = 207011; // no cdq resource
+static const int32_t ACL_ERROR_RT_OVER_LIMIT                 = 207012; // over limit
+static const int32_t ACL_ERROR_RT_QUEUE_EMPTY                = 207013; // queue is empty
+static const int32_t ACL_ERROR_RT_QUEUE_FULL                 = 207014; // queue is full
+static const int32_t ACL_ERROR_RT_REPEATED_INIT              = 207015; // repeated init
 
 static const int32_t ACL_ERROR_RT_INTERNAL_ERROR             = 507000; // runtime internal error
 static const int32_t ACL_ERROR_RT_TS_ERROR                   = 507001; // ts internel error
@@ -97,6 +101,10 @@ static const int32_t ACL_ERROR_RT_VECTOR_CORE_TIMEOUT        = 507034; // vector
 static const int32_t ACL_ERROR_RT_VECTOR_CORE_EXCEPTION      = 507035; // vector core exception
 static const int32_t ACL_ERROR_RT_VECTOR_CORE_TRAP_EXCEPTION = 507036; // vector core trap exception
 static const int32_t ACL_ERROR_RT_CDQ_BATCH_ABNORMAL         = 507037; // cdq alloc batch abnormal
+static const int32_t ACL_ERROR_RT_DIE_MODE_CHANGE_ERROR      = 507038; // can not change die mode
+static const int32_t ACL_ERROR_RT_DIE_SET_ERROR              = 507039; // single die mode can not set die
+static const int32_t ACL_ERROR_RT_INVALID_DIEID              = 507040; // invalid die id
+static const int32_t ACL_ERROR_RT_DIE_MODE_NOT_SET           = 507041; // die mode not set
 
 static const int32_t ACL_ERROR_RT_DRV_INTERNAL_ERROR         = 507899; // drv internal error
 static const int32_t ACL_ERROR_RT_AICPU_INTERNAL_ERROR       = 507900; // aicpu internal error
@@ -105,5 +113,4 @@ static const int32_t ACL_ERROR_RT_SOCKET_CLOSE               = 507901; // hdc di
 #ifdef __cplusplus
 }
 #endif
-
 #endif // __INC_EXTERNEL_RT_ERROR_CODES_H__
diff --git a/third_party/fwkacllib/inc/ops/array_ops.h b/third_party/fwkacllib/inc/ops/array_ops.h
index 18028c19..c02537cd 100644
--- a/third_party/fwkacllib/inc/ops/array_ops.h
+++ b/third_party/fwkacllib/inc/ops/array_ops.h
@@ -498,6 +498,25 @@ REG_OP(Constant)
     .OP_END_FACTORY_REG(Constant)
 
 /**
+*@brief Creates a file constant tensor. The operator is used to process very large weights that are stored in a file. \n
+
+*@par Attributes:
+*file_id: A string, used to record file id. \n
+*shape: data shape. \n
+*dtype: data type. \n
+
+*@par Outputs:
+*y: The FileConstant tensor. \n
+*/
+REG_OP(FileConstant)
+    .OUTPUT(y, TensorType({DT_FLOAT, DT_FLOAT16, DT_INT8, DT_INT16, DT_UINT16, \
+        DT_UINT8, DT_INT32, DT_INT64, DT_UINT32, DT_UINT64, DT_BOOL, DT_DOUBLE}))
+    .REQUIRED_ATTR(file_id, String)
+    .REQUIRED_ATTR(shape, ListInt)
+    .REQUIRED_ATTR(dtype, Type)
+    .OP_END_FACTORY_REG(FileConstant)
+
+/**
 *@brief Returns a copy of the input tensor. \n
 
 *@par Inputs:
@@ -1330,31 +1349,6 @@ REG_OP(ExpandD)
     .OP_END_FACTORY_REG(ExpandD)
 
 /**
-* @brief Calculate buckets limit and offset. \n
-
-* @par Inputs:
-* Three inputs, including:
-* @li bucket_list: A 1-D tensor of type int32 with the value of ivf_counts and ivf_offset index. \n
-* @li ivf_counts: A 1-D tensor of type int32 with the value of ivf counts. \n
-* @li ivf_offset: A 1-D tensor of type int32 or int64 with the value of ivf offset. \n
-
-* @par Attributes:
-* total_limit: A int64 type maximum value of the sum of ivf_counts corresponding to bucket_list. \n
-
-* @par Outputs:
-* @li buckets_limit: A 1-D tensor of type int32 with the sum <= total_limit. \n
-* @li buckets_offset: A 1-D tensor of type int32 or int64 with the value of ivf_offset corresponding to bucket_list. \n
-*/
-REG_OP(CalcBucketsLimitAndOffset)
-    .INPUT(bucket_list, TensorType({DT_INT32}))
-    .INPUT(ivf_counts, TensorType({DT_INT32}))
-    .INPUT(ivf_offset, TensorType({DT_INT32, DT_INT64}))
-    .OUTPUT(buckets_limit, TensorType({DT_INT32}))
-    .OUTPUT(buckets_offset, TensorType({DT_INT32, DT_INT64}))
-    .REQUIRED_ATTR(total_limit, Int)
-    .OP_END_FACTORY_REG(CalcBucketsLimitAndOffset)
-
-/**
 *@brief Get dim number in tensordesc. \n
 
 *@par Inputs:
@@ -1362,6 +1356,9 @@ REG_OP(CalcBucketsLimitAndOffset)
 
 *@par Outputs:
 *y: A 1D tensor. The data type must be int32. \n
+
+*@par Restrictions:
+*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
 */
 REG_OP(GetShape)
     .DYNAMIC_INPUT(x, TensorType({DT_DOUBLE, DT_FLOAT, DT_FLOAT16, DT_INT8, DT_UINT8, DT_INT16, \
@@ -1377,8 +1374,13 @@ REG_OP(GetShape)
 
 *@par outputs:
 * y: a tensor_desc, type is int.\n
+
+*@par Restrictions:
+*Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
 */
 REG_OP(UpdateTensorDesc)
+    .INPUT(x, TensorType({DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT32, DT_UINT32, DT_UINT8,
+                          DT_INT64, DT_UINT64, DT_INT16, DT_UINT16, DT_DOUBLE}))
     .OUTPUT(y, TensorType({DT_BOOL, DT_FLOAT16, DT_FLOAT, DT_INT8, DT_INT32, DT_UINT32, DT_UINT8,
                            DT_INT64, DT_UINT64, DT_INT16, DT_UINT16, DT_DOUBLE}))
     .REQUIRED_ATTR(shape, ListInt)
diff --git a/third_party/fwkacllib/inc/ops/image_ops.h b/third_party/fwkacllib/inc/ops/image_ops.h
index 319681e4..e771d67c 100644
--- a/third_party/fwkacllib/inc/ops/image_ops.h
+++ b/third_party/fwkacllib/inc/ops/image_ops.h
@@ -586,6 +586,14 @@ REG_OP(ResizeNearestNeighborV2GradD)
 channels], The image tensor that was resized . \n
 
 *@par Attributes:
+*@li size: An optional listint. Defaults to {}.
+*@par Attributes:
+*@li ori_image_size: An optional listint. Defaults to {}.
+*@par Attributes:
+*@li src_start_w: An optional int. Defaults to 0.
+*@par Attributes:
+*@li dst_start_w: An optional int. Defaults to 0.
+*@par Attributes:
 *@li align_corners: An optional bool. Defaults to False. If true, the centers of
 the 4 corner pixels of the input and grad tensors are aligned. Defaults to
 false .
@@ -606,6 +614,10 @@ REG_OP(ResizeBilinearV2Grad)
     .INPUT(grads, TensorType({DT_FLOAT}))
     .INPUT(original_image, TensorType::FloatingDataType())
     .OUTPUT(y, TensorType({DT_FLOAT}))
+    .ATTR(size, ListInt, {})
+    .ATTR(ori_image_size, ListInt, {})
+    .ATTR(src_start_w, Int, 0)
+    .ATTR(dst_start_w, Int, 0)
     .ATTR(align_corners, Bool, false)
     .ATTR(half_pixel_centers, Bool, false)
     .OP_END_FACTORY_REG(ResizeBilinearV2Grad)
@@ -624,7 +636,10 @@ size for the images . \n
 output tensors are aligned, preserving the values at the corner pixels.
 Defaults to false .
 * @li half_pixel_centers: An optional bool. Defaults to False . \n
-
+*@li ori_image_size: An optional listint. Defaults to {}.
+*@li split_size: An optional listint. Defaults to {}.
+*@li src_start_w: An optional int. Defaults to 0.
+*@li dst_start_w: An optional int. Defaults to 0. \n
 *@par Outputs:
 *y: 4-D with shape [batch, new_height, new_width, channels] . \n
 
@@ -640,6 +655,10 @@ REG_OP(ResizeBilinearV2)
                                DT_INT32, DT_INT64, DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
     .INPUT(size, TensorType({DT_INT32}))
     .OUTPUT(y, TensorType({DT_FLOAT}))
+    .ATTR(ori_image_size, ListInt, {})
+    .ATTR(split_size, ListInt, {})
+    .ATTR(src_start_w, Int, 0)
+    .ATTR(dst_start_w, Int, 0)
     .ATTR(align_corners, Bool, false)
     .ATTR(half_pixel_centers, Bool, false)
     .OP_END_FACTORY_REG(ResizeBilinearV2)
diff --git a/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h b/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h
index 66d67551..ccafa01f 100644
--- a/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h
+++ b/third_party/fwkacllib/inc/ops/nn_batch_norm_ops.h
@@ -113,9 +113,7 @@ if input "x" is with format NC1HWC0. Specifies the mean of "x".
 Must be 5D if input "x" is with format NC1HWC0. Specifies the variance of "x".
 *@li reserve_space_1: An optional Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
 Must be 5D if input "x" is with format NC1HWC0. Specifies the mean of "x" for gradient computation. Pass "None" to skip this output.
-*@li reserve_space_2: An optional Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW.
-Must be 5D if input "x" is with format NC1HWC0. Specifies the variance of "x" for gradient computation. Pass "None" to skip this output .
-*@li reserve_space_3: An optional Tensor of type float32. For compatibility with tensorflow, only has one useless element. \n
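+*@li reserve_space_2: An optional Tensor of type float32. Must be 1D if input "x" is with format NHWC or NCHW. Must be 5D if input "x" is with format NC1HWC0. Specifies the variance of "x" for gradient computation. Pass "None" to skip this output. \n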
 
 *@attention Constraints:
 *@li If the operation is used for inference and outputs "reserve_space_1" and "reserve_space_2" are available,
@@ -137,7 +135,6 @@ REG_OP(BatchNorm)
     .OUTPUT(batch_variance, TensorType({DT_FLOAT}))
     .OUTPUT(reserve_space_1, TensorType({DT_FLOAT}))
     .OUTPUT(reserve_space_2, TensorType({DT_FLOAT}))
-    .OUTPUT(reserve_space_3, TensorType({DT_FLOAT}))
     .ATTR(epsilon, Float, 0.0001)
     .ATTR(data_format, String, "NHWC")
     .ATTR(is_training, Bool, true)
@@ -167,6 +164,33 @@ REG_OP(SyncBatchNormBackwardReduce)
     .OP_END_FACTORY_REG(SyncBatchNormBackwardReduce)
 
 /**
+*@brief part of SyncBatchNormBackward . \n
+
+*@par Inputs:
+* Seven inputs, including:
+*@li grad_output: A Tensor. Must be one of the following types: float16, float32 .
+*@li save_input: A Tensor. Must be one of the following types: float16, float32 .
+*@li mean: A Tensor. Must be one of the following types: float16, float32 .
+*@li invstd: A Tensor. Must be one of the following types: float16, float32 .
+*@li weight: A Tensor. Must be one of the following types: float16, float32 .
+*@li mean_dy: A Tensor. Must be one of the following types: float16, float32 .
+*@li mean_dy_xmu: A Tensor. Must be one of the following types: float16, float32 . \n
+
+*@par Outputs:
+*@li grad_input: A Tensor. Has the same type and format as input "grad_output" . \n
+*/
+REG_OP(SyncBatchNormBackwardElemt)
+    .INPUT(grad_output, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .INPUT(save_input, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .INPUT(mean, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .INPUT(invstd, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .INPUT(weight, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .INPUT(mean_dy, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .INPUT(mean_dy_xmu, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .OUTPUT(grad_input, TensorType({DT_FLOAT16, DT_FLOAT}))
+    .OP_END_FACTORY_REG(SyncBatchNormBackwardElemt)
+
+/**
 *@brief Performs batch normalization . \n
 
 *@par Inputs:
@@ -285,8 +309,7 @@ REG_OP(BatchNormExt2)
 *@li x: A 4D or 5D Tensor of type float16 or float32, with format NHWC, NCHW, or NC1HWC0.
 *@li scale: A 4D or 5D Tensor of type float32, with format NHWC, NCHW, or NC1HWC0.
 *@li reserve_space_1: A 4D or 5D Tensor of type float32, with format NHWC, NCHW, or NC1HWC0. It is an output of BatchNorm.
-*@li reserve_space_2: A 4D or 5D Tensor of type float32, with format NHWC, NCHW, or NC1HWC0. It is an output of BatchNorm .
-*@li reserve_space_3: A 1D optional Tensor of type float32. It is an output of BatchNorm . \n
+*@li reserve_space_2: A 4D or 5D Tensor of type float32, with format NHWC, NCHW, or NC1HWC0. It is an output of BatchNorm . \n
 
 *@par Attributes:
 *@li epsilon: An optional float32. Defaults to "0.0001". A small float number added to the variance of "x".
@@ -313,7 +336,6 @@ REG_OP(BatchNormGrad)
     .INPUT(scale, TensorType({DT_FLOAT}))
     .INPUT(reserve_space_1, TensorType({DT_FLOAT}))
     .INPUT(reserve_space_2, TensorType({DT_FLOAT}))
-    .OPTIONAL_INPUT(reserve_space_3, TensorType({DT_FLOAT}))
     .OUTPUT(x_backprop, TensorType({DT_FLOAT16,DT_FLOAT}))
     .OUTPUT(scale_backprop, TensorType({DT_FLOAT}))
     .OUTPUT(offset_backprop, TensorType({DT_FLOAT}))
diff --git a/third_party/fwkacllib/inc/ops/ocr_ops.h b/third_party/fwkacllib/inc/ops/ocr_ops.h
index 639c34de..baab5af2 100644
--- a/third_party/fwkacllib/inc/ops/ocr_ops.h
+++ b/third_party/fwkacllib/inc/ops/ocr_ops.h
@@ -128,7 +128,7 @@ REG_OP(OCRIdentifyPreHandle)
     .INPUT(imgs_offset, TensorType({DT_INT32}))
     .INPUT(imgs_size, TensorType({DT_INT32}))
     .OUTPUT(resized_imgs, TensorType({DT_UINT8}))
-    .ATTR(size, ListInt, {})
+    .REQUIRED_ATTR(size, ListInt)
     .ATTR(data_format, String, "NHWC")
     .OP_END_FACTORY_REG(OCRIdentifyPreHandle)
 
@@ -247,6 +247,7 @@ REG_OP(OCRDetectionPostHandle)
 *@li clipped_polys_data: A Tensor of type int32. point data of every clipped poly. \n
 *@li clipped_polys_offset: A Tensor of type int32. Offset of every clipped poly . \n
 *@li clipped_polys_size: A Tensor of type int32. Size of every clipped poly. \n
+*@li clipped_polys_num: A Tensor of type int32. Number of clipped polys. \n
 */
 REG_OP(ResizeAndClipPolys)
     .INPUT(polys_data, TensorType({DT_INT32}))
@@ -259,6 +260,7 @@ REG_OP(ResizeAndClipPolys)
     .OUTPUT(clipped_polys_data, TensorType({DT_INT32}))
     .OUTPUT(clipped_polys_offset, TensorType({DT_INT32}))
     .OUTPUT(clipped_polys_size, TensorType({DT_INT32}))
+    .OUTPUT(clipped_polys_num, TensorType({DT_INT32}))
     .OP_END_FACTORY_REG(ResizeAndClipPolys);
 
 
diff --git a/third_party/fwkacllib/inc/ops/reduce_ops.h b/third_party/fwkacllib/inc/ops/reduce_ops.h
index e8c14b1a..4e4c74af 100644
--- a/third_party/fwkacllib/inc/ops/reduce_ops.h
+++ b/third_party/fwkacllib/inc/ops/reduce_ops.h
@@ -1305,6 +1305,27 @@ REG_OP(ReduceStdWithMean)
     .ATTR(invert, Bool, false)
     .ATTR(epsilon, Float, 0.001)
     .OP_END_FACTORY_REG(ReduceStdWithMean)
+
+/**
+*@brief Performs reduced batch normalization . \n
+
+*@par Inputs:
+*x: A 5D Tensor of type float16 or float32, with format NC1HWC0 . \n
+
+*@par Outputs:
+*@li mean: A Tensor of type float16 or float32 for SUM reduced "x".
+*@li variance: A Tensor of type float16 or float32 for square sum reduced "x" . \n
+
+*@par Restrictions:
+* Warning: THIS FUNCTION IS EXPERIMENTAL.  Please do not use.
+*/
+REG_OP(ReduceMeanVariance)
+    .INPUT(x, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .OUTPUT(mean, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .OUTPUT(variance, TensorType({DT_FLOAT16,DT_FLOAT}))
+    .ATTR(axes, ListInt, {})
+    .ATTR(keep_dims, Bool, true)
+    .OP_END_FACTORY_REG(ReduceMeanVariance)
 } //namespace ge
 
 #endif  // OPS_BUILT_IN_OP_PROTO_INC_REDUCE_OPS_H_
diff --git a/third_party/fwkacllib/inc/ops/vector_search.h b/third_party/fwkacllib/inc/ops/vector_search.h
index 8c1d0a2e..8f2201af 100644
--- a/third_party/fwkacllib/inc/ops/vector_search.h
+++ b/third_party/fwkacllib/inc/ops/vector_search.h
@@ -78,8 +78,8 @@ REG_OP(TopKPQDistance)
     .OUTPUT(topk_ivf, TensorType({DT_INT32}))
     .OUTPUT(topk_index, TensorType({DT_INT32}))
     .ATTR(order, String, "ASC")
-    .ATTR(k, Int, 0)
-    .ATTR(group_size, Int, 0)
+    .REQUIRED_ATTR(k, Int)
+    .REQUIRED_ATTR(group_size, Int)
     .OP_END_FACTORY_REG(TopKPQDistance)
 
 /**
@@ -129,6 +129,68 @@ REG_OP(ScanPQCodes)
     .ATTR(split_count, Int, 1)
     .ATTR(split_index, Int, 0)
     .OP_END_FACTORY_REG(ScanPQCodes)
+
+/**
+* @brief Calculate buckets limit and offset. \n
+
+* @par Inputs:
+* Three inputs, including:
+* @li bucket_list: A 1-D tensor of type int32 with the value of ivf_counts and ivf_offset index. \n
+* @li ivf_counts: A 1-D tensor of type int32 with the value of ivf counts. \n
+* @li ivf_offset: A 1-D tensor of type int32 or int64 with the value of ivf offset. \n
+
+* @par Attributes:
+* total_limit: An int64 type maximum value of the sum of ivf_counts corresponding to bucket_list. \n
+
+* @par Outputs:
+* @li buckets_limit: A 1-D tensor of type int32 with the sum <= total_limit. \n
+* @li buckets_offset: A 1-D tensor of type int32 or int64 with the value of ivf_offset corresponding to bucket_list. \n
+*/
+REG_OP(CalcBucketsLimitAndOffset)
+    .INPUT(bucket_list, TensorType({DT_INT32}))
+    .INPUT(ivf_counts, TensorType({DT_INT32}))
+    .INPUT(ivf_offset, TensorType({DT_INT32, DT_INT64}))
+    .OUTPUT(buckets_limit, TensorType({DT_INT32}))
+    .OUTPUT(buckets_offset, TensorType({DT_INT32, DT_INT64}))
+    .REQUIRED_ATTR(total_limit, Int)
+    .OP_END_FACTORY_REG(CalcBucketsLimitAndOffset)
+
+/**
+* @brief Calculate ProdVirialSeA. \n
+*
+* @par Inputs:
+* Five inputs, including:
+* @li net_deriv: A Tensor. Must be one of the following types: float16, float32, float64.
+* @li in_deriv: A Tensor. Must be one of the following types: float16, float32, float64.
+* @li rij: A Tensor. Must be one of the following types: float16, float32, float64.
+* @li nlist: A Tensor. dtype is int32.
+* @li natoms: A Tensor. dtype is int32. \n
+*
+* @par Outputs:
+* Two outputs, including:
+* @li virial: A Tensor. Must be one of the following types: float16, float32, float64.
+* @li atom_virial: A Tensor. Must be one of the following types: float16, float32, float64. \n
+*
+* @par Attributes:
+* @li n_a_sel: A required int.
+* @li n_r_sel: A required int.
+* @li nall: An optional int. Defaults to 28328. \n
+*
+* @par Restrictions:
+* Warning: THIS FUNCTION IS EXPERIMENTAL. Please do not use.
+*/
+REG_OP(ProdVirialSeA)
+    .INPUT(net_deriv, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
+    .INPUT(in_deriv, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
+    .INPUT(rij, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
+    .INPUT(nlist, TensorType({DT_INT32}))
+    .INPUT(natoms, TensorType({DT_INT32}))
+    .OUTPUT(virial, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
+    .OUTPUT(atom_virial, TensorType({DT_FLOAT16, DT_FLOAT, DT_DOUBLE}))
+    .REQUIRED_ATTR(n_a_sel, Int)
+    .REQUIRED_ATTR(n_r_sel, Int)
+    .ATTR(nall, Int, 28328)
+    .OP_END_FACTORY_REG(ProdVirialSeA)
 } // namespace ge
 
 #endif  // OPS_BUILT_IN_OP_PROTO_INC_VECTOR_SEARCH_H_
diff --git a/third_party/fwkacllib/inc/runtime/base.h b/third_party/fwkacllib/inc/runtime/base.h
index 9e95a8b9..39301554 100644
--- a/third_party/fwkacllib/inc/runtime/base.h
+++ b/third_party/fwkacllib/inc/runtime/base.h
@@ -14,8 +14,8 @@
  * limitations under the License.
  */
 
-#ifndef __CCE_RUNTIME_BASE_H__
-#define __CCE_RUNTIME_BASE_H__
+#ifndef CCE_RUNTIME_BASE_H
+#define CCE_RUNTIME_BASE_H
 
 #include <stdint.h>
 #include "toolchain/prof_callback.h"
@@ -443,4 +443,4 @@ RTS_API rtError_t rtGetTaskIdAndStreamID(uint32_t *taskId, uint32_t *streamId);
 }
 #endif
 
-#endif  // __CCE_RUNTIME_BASE_H__
+#endif  // CCE_RUNTIME_BASE_H
\ No newline at end of file
diff --git a/third_party/fwkacllib/inc/runtime/config.h b/third_party/fwkacllib/inc/runtime/config.h
index 64ab1497..f9e6a49e 100644
--- a/third_party/fwkacllib/inc/runtime/config.h
+++ b/third_party/fwkacllib/inc/runtime/config.h
@@ -239,8 +239,18 @@ RTS_API rtError_t rtSetOpWaitTimeOut(uint32_t timeout);
  */
 RTS_API rtError_t rtSetOpExecuteTimeOut(uint32_t timeout);
 
+/**
+ * @ingroup
+ * @brief query whether the runtime is in heterogeneous mode.
+ * @param [out] heterogenous   1: heterogeneous mode (isHeterogenous=1 was read from the ini file);
+ *      0: NOT heterogeneous mode, which covers any of the following cases:
+ *      1:not found ini file, 2:error when reading ini, 3:the isHeterogenous value is not 1
+ * @return RT_ERROR_NONE for ok
+ */
+RTS_API rtError_t rtGetIsHeterogenous(int32_t *heterogenous);
+
 #if defined(__cplusplus)
 }
 #endif
 
-#endif // CCE_RUNTIME_CONFIG_H
\ No newline at end of file
+#endif // CCE_RUNTIME_CONFIG_H
diff --git a/third_party/fwkacllib/inc/runtime/dev.h b/third_party/fwkacllib/inc/runtime/dev.h
index f6777262..75d01f36 100644
--- a/third_party/fwkacllib/inc/runtime/dev.h
+++ b/third_party/fwkacllib/inc/runtime/dev.h
@@ -25,7 +25,7 @@ extern "C" {
 
 #define RT_CAPABILITY_SUPPORT     (0x1U)
 #define RT_CAPABILITY_NOT_SUPPORT (0x0U)
-#define MEMORY_INFO_TS_4G_LIMITED (0x0) // for compatibility
+#define MEMORY_INFO_TS_4G_LIMITED (0x0U) // for compatibility
 
 typedef struct tagRTDeviceInfo {
     uint8_t env_type;  // 0: FPGA  1: EMU 2: ESL
@@ -173,6 +173,15 @@ RTS_API rtError_t rtSetDeviceV2(int32_t device, rtDeviceMode deviceMode);
 
 /**
  * @ingroup dvrt_dev
+ * @brief get deviceMode
+ * @param [out] deviceMode   the device mode
+ * @return RT_ERROR_NONE for ok
+ * @return RT_ERROR_INVALID_VALUE for error input
+ */
+RTS_API rtError_t rtGetDeviceMode(rtDeviceMode *deviceMode);
+
+/**
+ * @ingroup dvrt_dev
  * @brief set target die for current thread
  * @param [int] die   the die id
  * @return RT_ERROR_NONE for ok
diff --git a/third_party/fwkacllib/inc/runtime/kernel.h b/third_party/fwkacllib/inc/runtime/kernel.h
index 8c556e3a..2bd7f284 100644
--- a/third_party/fwkacllib/inc/runtime/kernel.h
+++ b/third_party/fwkacllib/inc/runtime/kernel.h
@@ -133,8 +133,11 @@ typedef struct tagRtArgsWithTiling {
     uint16_t tilingDataOffset;      // tiling data offset
     uint16_t hostInputAddrOffset;   // index of host_memory input in inputs_addrs list
     uint16_t hostInputDataOffset;   // host_mem input data offset
-    bool hasHostMemInput;           // has host_memory input data in args or not: ture or false
-    uint8_t reserved[7];
+    uint8_t hasHostMemInput;        // has host_memory input data in args or not: 0 means no host_memory input data,
+                                    // others means has host_memory input data.
+    uint8_t isNoNeedH2DCopy;        // is no need host to device copy: 0 means need H2D copy,
+                                    // others means doesn't need H2D copy.
+    uint8_t reserved[6];
 } rtArgsWithTiling_t;
 
 /**
@@ -299,8 +302,8 @@ RTS_API rtError_t rtDependencyRegister(void *mHandle, void *sHandle);
  * @return RT_ERROR_NONE for ok
  * @return RT_ERROR_INVALID_VALUE for error input
  */
-RTS_API rtError_t rtFunctionRegister(void *binHandle, const void *stubFunc, const char_t *stubName, const void *devFunc,
-                                     uint32_t funcMode);
+RTS_API rtError_t rtFunctionRegister(void *binHandle, const void *stubFunc, const char_t *stubName,
+                                     const void *devFunc, uint32_t funcMode);
 
 /**
  * @ingroup rt_kernel
@@ -371,8 +374,9 @@ RTS_API rtError_t rtKernelLaunch(const void *stubFunc, uint32_t blockDim, void *
  * @return RT_ERROR_NONE for ok
  * @return RT_ERROR_INVALID_VALUE for error input
  */
-RTS_API rtError_t rtKernelLaunchWithHandle(void *handle, const void *devFunc, uint32_t blockDim, void *args, uint32_t argsSize,
-                                            rtSmDesc_t *smDesc, rtStream_t stream_, const void *kernelInfo);
+RTS_API rtError_t rtKernelLaunchWithHandle(void *handle, const void *devFunc, uint32_t blockDim,
+                                           void *args, uint32_t argsSize, rtSmDesc_t *smDesc, rtStream_t stream_,
+                                           const void *kernelInfo);
 
 /**
  * @ingroup rt_kernel
diff --git a/third_party/fwkacllib/inc/runtime/mem.h b/third_party/fwkacllib/inc/runtime/mem.h
index 971f0cb0..d095ef0c 100644
--- a/third_party/fwkacllib/inc/runtime/mem.h
+++ b/third_party/fwkacllib/inc/runtime/mem.h
@@ -576,7 +576,7 @@ RTS_API rtError_t rtRDMASend(uint32_t index, uint32_t wqeIndex, rtStream_t strea
  * @return RT_ERROR_INVALID_VALUE for error input
  * @return RT_ERROR_DRV_ERR for driver error
  */
-RTS_API rtError_t rtSetIpcMemPid(const char_t *name, int32_t pid[], int num);
+RTS_API rtError_t rtSetIpcMemPid(const char_t *name, int32_t pid[], int32_t num);
 
 /**
  * @ingroup dvrt_mem
diff --git a/third_party/fwkacllib/inc/runtime/rt.h b/third_party/fwkacllib/inc/runtime/rt.h
index 519ccd40..8c3e339f 100644
--- a/third_party/fwkacllib/inc/runtime/rt.h
+++ b/third_party/fwkacllib/inc/runtime/rt.h
@@ -31,5 +31,6 @@
 #include "rt_ffts.h"
 #include "rt_ffts_plus.h"
 #include "rt_dfx.h"
+#include "rt_mem_queue.h"
 
-#endif  // CCE_RUNTIME_RT_H
\ No newline at end of file
+#endif  // CCE_RUNTIME_RT_H
diff --git a/third_party/fwkacllib/inc/toolchain/prof_acl_api.h b/third_party/fwkacllib/inc/toolchain/prof_acl_api.h
index 0bc63385..80f4baab 100644
--- a/third_party/fwkacllib/inc/toolchain/prof_acl_api.h
+++ b/third_party/fwkacllib/inc/toolchain/prof_acl_api.h
@@ -23,6 +23,8 @@
 #define PROF_AICORE_METRICS         0x00000004
 #define PROF_AICPU_TRACE            0x00000008
 #define PROF_L2CACHE                0x00000010
+#define PROF_HCCL_TRACE             0x00000020
+#define PROF_TRAINING_TRACE         0x00000040
 
 // system profilinig switch
 #define PROF_CPU                    0x00010000
@@ -41,10 +43,7 @@
 #define PROF_AIVECTORCORE_METRICS   0x0000020000000
 #define PROF_SUBTASK_TIME           0x0000040000000
 
-#define PROF_TRAINING_TRACE         0x0000080000000
-#define PROF_HCCL_TRACE             0x0000100000000
-
-#define PROF_TASK_TRACE             0x0000185000002
+#define PROF_TASK_TRACE             0x0000005000062
 
 #define PROF_MODEL_LOAD             0x8000000000000000
 
@@ -54,6 +53,8 @@
 #define PROF_AICORE_METRICS_MASK         0x00000004
 #define PROF_AICPU_TRACE_MASK            0x00000008
 #define PROF_L2CACHE_MASK                0x00000010
+#define PROF_HCCL_TRACE_MASK             0x00000020
+#define PROF_TRAINING_TRACE_MASK         0x00000040
 
 // system profilinig mask
 #define PROF_CPU_MASK                    0x00010000
@@ -72,9 +73,6 @@
 #define PROF_AIVECTORCORE_METRICS_MASK   0x0000020000000
 #define PROF_SUBTASK_TIME_MASK           0x0000040000000
 
-#define PROF_TRAINING_TRACE_MASK         0x0000080000000
-#define PROF_HCCL_TRACE_MASK             0x0000100000000
-
 #define PROF_MODEL_LOAD_MASK             0x8000000000000000
 
 #if (defined(_WIN32) || defined(_WIN64) || defined(_MSC_VER))