
Synchronize with latest Ascend software suite 17 Jun 2020

tags/v0.6.0-beta
yanghaoran, 5 years ago
commit d4b7f64386
100 changed files with 6318 additions and 2546 deletions
  1. +22 -1 inc/common/blocking_queue.h
  2. +3 -1 inc/common/opskernel/ge_task_info.h
  3. +1 -1 inc/common/opskernel/ops_kernel_info_store.h
  4. +34 -6 inc/external/ge/ge_api_types.h
  5. +4 -13 inc/external/graph/operator.h
  6. +69 -71 inc/external/graph/operator_reg.h
  7. +1 -1 inc/external/graph/types.h
  8. +9 -0 inc/framework/common/ge_types.h
  9. +7 -0 inc/framework/common/types.h
  10. +0 -61 inc/framework/dlog/log.h
  11. +0 -0 inc/framework/ge_runtime/task_info.h
  12. +113 -0 inc/framework/ge_runtime_dummy/davinci_model.h
  13. +58 -0 inc/framework/ge_runtime_dummy/model_runner.h
  14. +72 -0 inc/framework/ge_runtime_dummy/op_info.h
  15. +394 -0 inc/framework/ge_runtime_dummy/task_info.h
  16. +21 -1 inc/framework/generator/ge_generator.h
  17. +113 -0 inc/framework/omg/omg.h
  18. +6 -5 inc/framework/omg/omg_inner_types.h
  19. +2 -0 inc/graph/compute_graph.h
  20. +19 -1 inc/graph/debug/ge_attr_define.h
  21. +3 -1 inc/graph/op_desc.h
  22. +46 -0 inc/graph/runtime_inference_context.h
  23. +53 -0 inc/graph/utils/graph_utils.h
  24. +20 -0 inc/graph/utils/node_utils.h
  25. +2 -0 inc/graph/utils/type_utils.h
  26. +155 -35 src/common/graph/compute_graph.cc
  27. +6 -0 src/common/graph/debug/ge_op_types.h
  28. +18 -0 src/common/graph/ge_attr_define.cc
  29. +22 -1 src/common/graph/op_desc.cc
  30. +127 -37 src/common/graph/operator.cc
  31. +9 -19 src/common/graph/ref_relation.cc
  32. +95 -0 src/common/graph/runtime_inference_context.cc
  33. +6 -0 src/common/graph/utils/ge_ir_utils.cc
  34. +167 -11 src/common/graph/utils/graph_utils.cc
  35. +80 -0 src/common/graph/utils/node_utils.cc
  36. +17 -0 src/common/graph/utils/op_desc_utils.cc
  37. +30 -0 src/common/graph/utils/type_utils.cc
  38. +88 -71 src/ge/CMakeLists.txt
  39. +2 -19 src/ge/client/ge_api.cc
  40. +5 -1 src/ge/common/convert/pb2json.cc
  41. +22 -15 src/ge/common/formats/format_transfers/format_transfer_transpose.cc
  42. +2 -0 src/ge/common/formats/format_transfers/format_transfer_transpose.h
  43. +1 -2 src/ge/common/helper/model_helper.cc
  44. +7 -4 src/ge/common/profiling/profiling_manager.cc
  45. +7 -0 src/ge/common/types.cc
  46. +41 -30 src/ge/common/util.cc
  47. +3 -0 src/ge/executor/CMakeLists.txt
  48. +17 -0 src/ge/ge_local_engine/ops_kernel_store/ge_local_ops_kernel_info.cc
  49. +11 -11 src/ge/ge_runtime/task/hccl_task.cc
  50. +171 -69 src/ge/generator/ge_generator.cc
  51. +213 -32 src/ge/graph/build/graph_builder.cc
  52. +12 -3 src/ge/graph/build/graph_builder.h
  53. +7 -3 src/ge/graph/build/logical_stream_allocator.cc
  54. +211 -58 src/ge/graph/build/memory/block_mem_assigner.cc
  55. +47 -7 src/ge/graph/build/memory/block_mem_assigner.h
  56. +9 -7 src/ge/graph/build/memory/graph_mem_assigner.cc
  57. +1 -1 src/ge/graph/build/memory/var_mem_assign_util.cc
  58. +11 -1 src/ge/graph/build/model_builder.cc
  59. +1 -0 src/ge/graph/build/model_builder.h
  60. +817 -731 src/ge/graph/build/stream_allocator.cc
  61. +30 -20 src/ge/graph/build/stream_allocator.h
  62. +1 -1 src/ge/graph/build/stream_graph_optimizer.cc
  63. +148 -11 src/ge/graph/build/task_generator.cc
  64. +13 -1 src/ge/graph/build/task_generator.h
  65. +7 -7 src/ge/graph/execute/graph_execute.cc
  66. +3 -3 src/ge/graph/execute/graph_execute.h
  67. +2 -42 src/ge/graph/label/label_maker.cc
  68. +0 -1 src/ge/graph/label/label_maker.h
  69. +15 -0 src/ge/graph/label/partitioned_call_label_maker.cc
  70. +2 -0 src/ge/graph/label/partitioned_call_label_maker.h
  71. +4 -4 src/ge/graph/load/graph_loader.cc
  72. +1 -1 src/ge/graph/load/graph_loader.h
  73. +26 -9 src/ge/graph/load/new_model_manager/data_dumper.cc
  74. +206 -477 src/ge/graph/load/new_model_manager/davinci_model.cc
  75. +34 -5 src/ge/graph/load/new_model_manager/davinci_model.h
  76. +77 -6 src/ge/graph/load/new_model_manager/model_manager.cc
  77. +10 -2 src/ge/graph/load/new_model_manager/model_manager.h
  78. +1 -1 src/ge/graph/load/new_model_manager/model_utils.cc
  79. +123 -96 src/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc
  80. +10 -19 src/ge/graph/load/new_model_manager/task_info/hccl_task_info.h
  81. +83 -21 src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc
  82. +6 -0 src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.h
  83. +40 -2 src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc
  84. +5 -0 src/ge/graph/load/new_model_manager/task_info/kernel_task_info.h
  85. +4 -0 src/ge/graph/load/new_model_manager/task_info/task_info.h
  86. +343 -0 src/ge/graph/manager/graph_caching_allocator.cc
  87. +212 -0 src/ge/graph/manager/graph_caching_allocator.h
  88. +327 -249 src/ge/graph/manager/graph_manager.cc
  89. +17 -13 src/ge/graph/manager/graph_manager.h
  90. +4 -0 src/ge/graph/manager/graph_manager_utils.h
  91. +71 -19 src/ge/graph/manager/graph_mem_allocator.cc
  92. +23 -5 src/ge/graph/manager/graph_mem_allocator.h
  93. +432 -0 src/ge/graph/manager/trans_var_data_utils.cc
  94. +8 -0 src/ge/graph/manager/trans_var_data_utils.h
  95. +229 -38 src/ge/graph/manager/util/hcom_util.cc
  96. +95 -21 src/ge/graph/manager/util/hcom_util.h
  97. +34 -3 src/ge/graph/optimize/graph_optimize.cc
  98. +2 -0 src/ge/graph/optimize/graph_optimize.h
  99. +135 -113 src/ge/graph/partition/dynamic_shape_partition.cc
  100. +25 -25 src/ge/graph/partition/dynamic_shape_partition.h

+22 -1 inc/common/blocking_queue.h

@@ -42,7 +42,7 @@ class BlockingQueue {
return false;
}

item = queue_.front();
item = std::move(queue_.front());
queue_.pop_front();

full_cond_.notify_one();
@@ -71,6 +71,27 @@ class BlockingQueue {
return true;
}

bool Push(T &&item, bool is_wait = true) {
std::unique_lock<std::mutex> lock(mutex_);

while (queue_.size() >= max_size_ && !is_stoped_) {
if (!is_wait) {
return false;
}
full_cond_.wait(lock);
}

if (is_stoped_) {
return false;
}

queue_.emplace_back(std::move(item));

empty_cond_.notify_one();

return true;
}

void Stop() {
{
std::unique_lock<std::mutex> lock(mutex_);
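
The pop path now moves the front element out instead of copying it, and the new rvalue Push overload makes the queue usable with move-only payloads. A minimal usage sketch, assuming the header above and that the first hunk belongs to a Pop(T &item, ...) method (the method name is not visible in this diff):

#include <memory>
#include <utility>
#include "common/blocking_queue.h"  // include path assumed

void Producer(BlockingQueue<std::unique_ptr<int>> &queue) {
  std::unique_ptr<int> item(new int(42));
  // Binds to the new Push(T &&) overload; the queue takes ownership via emplace_back.
  (void)queue.Push(std::move(item));
}

void Consumer(BlockingQueue<std::unique_ptr<int>> &queue) {
  std::unique_ptr<int> item;
  // The front element is now moved out rather than copied.
  if (queue.Pop(item)) {
    // ... use *item ...
  }
}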


+3 -1 inc/common/opskernel/ge_task_info.h

@@ -26,6 +26,7 @@ using std::string;
namespace ge {
// once GETaskKernelHcclInfo is eliminated, DAVINCI_TRAIN/DAVINCI_CLOUD will no longer be needed
struct GETaskKernelHcclInfo {
string input_name;
string hccl_type;
void *inputDataAddr;
void *outputDataAddr;
@@ -35,6 +36,7 @@ struct GETaskKernelHcclInfo {
int32_t opType;
int64_t rootId;
uint64_t workSpaceMemSize;
std::vector<int64_t> dims;
std::vector<rtStream_t> hcclStreamList;
};

@@ -48,7 +50,7 @@ struct GETaskInfo {
uint32_t privateDefLen;
void *opsKernelStorePtr;

GETaskKernelHcclInfo kernelHcclInfo;
std::vector<GETaskKernelHcclInfo> kernelHcclInfo;
};
} // namespace ge
#endif // INC_COMMON_OPSKERNEL_GE_TASK_INFO_H_
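
Since kernelHcclInfo changed from a single struct to a vector, consumers that previously read one GETaskKernelHcclInfo must iterate. A hedged sketch (the field usage is illustrative only):

#include <cstdio>
#include "common/opskernel/ge_task_info.h"  // include path assumed

void PrintHcclKernels(const ge::GETaskInfo &task) {
  // One GETaskInfo may now carry several HCCL kernel descriptions.
  for (const auto &info : task.kernelHcclInfo) {
    std::printf("hccl_type=%s input=%s dims=%zu\n", info.hccl_type.c_str(),
                info.input_name.c_str(), info.dims.size());
  }
}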

+1 -1 inc/common/opskernel/ops_kernel_info_store.h

@@ -73,7 +73,7 @@ class OpsKernelInfoStore {

// only call fe engine interface to compile single op
virtual Status CompileOp(vector<ge::NodePtr> &node_vec) { return SUCCESS; }
virtual Status CompileOpRun(vector<ge::NodePtr> &node_vec) { return SUCCESS; }
// load task for op
virtual Status LoadTask(GETaskInfo &task) { return SUCCESS; }



+34 -6 inc/external/ge/ge_api_types.h

@@ -33,6 +33,7 @@ const char *const OPTION_EXEC_SESSION_ID = "ge.exec.sessionId";
const char *const OPTION_EXEC_DEVICE_ID = "ge.exec.deviceId";
const char *const OPTION_EXEC_JOB_ID = "ge.exec.jobId";
const char *const OPTION_EXEC_IS_USEHCOM = "ge.exec.isUseHcom";
const char *const OPTION_EXEC_IS_USEHVD = "ge.exec.isUseHvd";
const char *const OPTION_EXEC_RANK_ID = "ge.exec.rankId";
const char *const OPTION_EXEC_POD_NAME = "ge.exec.podName";
const char *const OPTION_EXEC_DEPLOY_MODE = "ge.exec.deployMode";
@@ -52,6 +53,7 @@ const char *const OPTION_EXEC_PROFILING_OPTIONS = "ge.exec.profilingOptions";
const char *const OPTION_EXEC_HCCL_FLAG = "ge.exec.hcclFlag";
const char *const OPTION_EXEC_ATOMIC_FLAG = "ge.exec.enable_atomic";
const char *const OPTION_EXEC_DISABLE_REUSED_MEMORY = "ge.exec.disableReuseMemory";
const char *const OPTION_EXEC_ENABLE_TAILING_OPTIMIZATION = "ge.exec.isTailingOptimization";

// Option key: memory init
const char *const GRAPH_MEMORY_MAX_SIZE = "ge.graphMemoryMaxSize";
@@ -153,7 +155,7 @@ const std::string STREAM_MAX_PARALLEL_NUM = "ge.streamMaxParallelNum";
const std::string OUTPUT_DATATYPE = "ge.outputDatatype";

// configure opSelectImplmode to set the op select implmode
const std::string kOpSelectImplmode = "ge.opSelectImplmode";
const std::string OP_SELECT_IMPL_MODE = "ge.opSelectImplmode";

// configure whether to enable hcom parallel by session constructor options param,
// its value should be "0" or "1", default value is "0"
@@ -214,6 +216,9 @@ const char *const ENABLE_PRINT_OP_PASS = "ge.enablePrintOpPass";
// Its value should be "true" or "false", default value is "false"
const char *const ENABLE_SINGLE_STREAM = "ge.enableSingleStream";

// Configure input fp16 nodes
const std::string INPUT_FP16_NODES = "ge.INPUT_NODES_SET_FP16";

// Graph run mode
enum GraphRunMode { PREDICTION = 0, TRAIN };

@@ -263,14 +268,37 @@ static const char *const AUTO_TUNE_MODE = ge::AUTO_TUNE_MODE.c_str();
static const char *const CORE_TYPE = ge::CORE_TYPE.c_str();
static const char *const SOC_VERSION = ge::SOC_VERSION.c_str();
static const char *const ENABLE_SINGLE_STREAM = ge::ENABLE_SINGLE_STREAM;
static const char *const AICORE_NUM = ge::AICORE_NUM.c_str();
static const char *const FUSION_SWITCH_FILE = ge::FUSION_SWITCH_FILE.c_str();
static const char *const ENABLE_SMALL_CHANNEL = ge::ENABLE_SMALL_CHANNEL.c_str();
static const char *const QUANT_OPTIMIZE = ge::QUANT_OPTIMIZE.c_str();
static const char *const OP_SELECT_IMPL_MODE = ge::OP_SELECT_IMPL_MODE.c_str();
static const char *const OUTPUT_TYPE = ge::OUTPUT_DATATYPE.c_str();
static const char *const BUFFER_OPTIMIZE = ge::BUFFER_OPTIMIZE.c_str();
static const char *const ENABLE_COMPRESS_WEIGHT = ge::ENABLE_COMPRESS_WEIGHT.c_str();
static const char *const COMPRESS_WEIGHT_CONF = "compress_weight_conf";
static const char *const OUT_NODES = ge::OUTPUT_NODE_NAME.c_str();
static const char *const INPUT_FP16_NODES = ge::INPUT_FP16_NODES.c_str();
static const char *const LOG_LEVEL = "log";

// for interface: aclgrphBuildModel
const std::set<std::string> ir_builder_suppported_options = {INPUT_FORMAT, INPUT_SHAPE, DYNAMIC_BATCH_SIZE,
DYNAMIC_IMAGE_SIZE, INSERT_OP_FILE};
const std::set<std::string> ir_builder_suppported_options = {
INPUT_FORMAT, INPUT_SHAPE, DYNAMIC_BATCH_SIZE, DYNAMIC_IMAGE_SIZE,
INSERT_OP_FILE, OUTPUT_TYPE, BUFFER_OPTIMIZE, ENABLE_COMPRESS_WEIGHT,
COMPRESS_WEIGHT_CONF, OUT_NODES, INPUT_FP16_NODES, LOG_LEVEL};
// for interface: aclgrphBuildInitialize
const std::set<std::string> global_options = {
HEAD_STREAM, CORE_TYPE, SOC_VERSION, PRECISION_MODE, EXEC_DISABLE_REUSED_MEMORY,
AUTO_TUNE_MODE, ENABLE_SINGLE_STREAM};
const std::set<std::string> global_options = {HEAD_STREAM,
CORE_TYPE,
SOC_VERSION,
PRECISION_MODE,
EXEC_DISABLE_REUSED_MEMORY,
AUTO_TUNE_MODE,
ENABLE_SINGLE_STREAM,
AICORE_NUM,
FUSION_SWITCH_FILE,
ENABLE_SMALL_CHANNEL,
QUANT_OPTIMIZE,
OP_SELECT_IMPL_MODE};
} // namespace ir_option
} // namespace ge
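
Because the supported-option lists are plain std::set<std::string> constants, a front end can validate user-supplied option keys against them before calling the matching interface. A minimal sketch, assuming only the declarations above (ValidateModelBuildOptions is a hypothetical helper, not part of this header):

#include <map>
#include <string>
#include "external/ge/ge_api_types.h"  // include path assumed

// Hypothetical helper: reject option keys that aclgrphBuildModel does not accept.
bool ValidateModelBuildOptions(const std::map<std::string, std::string> &options) {
  for (const auto &kv : options) {
    if (ge::ir_option::ir_builder_suppported_options.count(kv.first) == 0) {
      return false;  // unknown key, or a global-only option such as SOC_VERSION
    }
  }
  return true;
}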



+4 -13 inc/external/graph/operator.h

@@ -48,12 +48,9 @@ class NamedAttrs;
class Graph;
class AttrValue;

using SubgraphBuilder = std::function<Graph(const std::string &name)>;
using SubgraphBuilder = std::function<Graph()>;
using OperatorImplPtr = std::shared_ptr<OperatorImpl>;

class Graph;
using GraphBuilderCallback = std::function<Graph()>;

class OpIO;
using OutHandler = std::shared_ptr<OpIO>;
using InHandler = std::shared_ptr<OpIO>;
@@ -139,12 +136,6 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Operator {
void SetInferenceContext(const InferenceContextPtr &inference_context);
InferenceContextPtr GetInferenceContext() const;

void SetGraphBuilder(const GraphBuilderCallback &builder);
graphStatus GetGraphBuilder(GraphBuilderCallback &builder) const;

void AddSubgraphName(const string &name);
string GetSubgraphName(int index) const;

graphStatus VerifyAllAttr(bool disable_common_verifier = false);

size_t GetInputsSize() const;
@@ -265,9 +256,9 @@ class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY Operator {

Operator &SetInput(const string &dst_name, uint32_t dst_index, const Operator &src_oprt, const string &name);

void SubgraphRegister(const std::string &name, bool dynamic);
void SubgraphCountRegister(const std::string &name, uint32_t count);
void SetSubgraphBuilder(const std::string &name, uint32_t index, const SubgraphBuilder &builder);
void SubgraphRegister(const string &ir_name, bool dynamic);
void SubgraphCountRegister(const string &ir_name, uint32_t count);
void SetSubgraphBuilder(const string &ir_name, uint32_t index, const SubgraphBuilder &builder);

private:
Operator &SetInput(const string &dst_name, const OutHandler &out_handler);
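
With the signature change, a subgraph builder no longer receives the subgraph name; it is now a zero-argument callable that captures whatever it needs. A sketch under that assumption (the branch-graph contents are elided):

#include "external/graph/graph.h"
#include "external/graph/operator.h"

ge::SubgraphBuilder MakeThenBranchBuilder() {
  // The graph name is now fixed at the capture site instead of being passed in.
  return []() -> ge::Graph {
    ge::Graph branch("then_branch");
    // ... populate the branch graph ...
    return branch;
  };
}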


+69 -71 inc/external/graph/operator_reg.h

@@ -186,56 +186,54 @@ class OpReg {
Operator::OutputRegister(#x); \
(void)OpReg()

#define DYNAMIC_INPUT(x, t) \
N(); \
__dy_input_##x(); \
} \
\
public: \
_THIS_TYPE &create_dynamic_input_##x(unsigned int num, bool isPushBack = true) { \
Operator::DynamicInputRegister(#x, num, isPushBack); \
return *this; \
} \
_THIS_TYPE &create_dynamic_input_byindex_##x(unsigned int num, size_t index) { \
Operator::DynamicInputRegisterByIndex(#x, num, index); \
return *this; \
} \
TensorDesc get_dynamic_input_desc_##x(unsigned int index) const { return Operator::GetDynamicInputDesc(#x, index); } \
graphStatus update_dynamic_input_desc_##x(unsigned int index, const TensorDesc &tensorDesc) { \
return Operator::UpdateDynamicInputDesc(#x, index, tensorDesc); \
} \
_THIS_TYPE &set_dynamic_input_##x(unsigned int dstIndex, Operator &v) { \
Operator::SetInput(#x, dstIndex, v); \
return *this; \
} \
_THIS_TYPE &set_dynamic_input_##x(unsigned int dstIndex, Operator &v, const string &srcName) { \
Operator::SetInput(#x, dstIndex, v, srcName); \
return *this; \
} \
\
private: \
void __dy_input_##x() { \
#define DYNAMIC_INPUT(x, t) \
N(); \
__dy_input_##x(); \
} \
\
public: \
_THIS_TYPE &create_dynamic_input_##x(uint32_t num, bool isPushBack = true) { \
Operator::DynamicInputRegister(#x, num, isPushBack); \
return *this; \
} \
_THIS_TYPE &create_dynamic_input_byindex_##x(uint32_t num, size_t index) { \
Operator::DynamicInputRegisterByIndex(#x, num, index); \
return *this; \
} \
TensorDesc get_dynamic_input_desc_##x(uint32_t index) const { return Operator::GetDynamicInputDesc(#x, index); } \
graphStatus update_dynamic_input_desc_##x(uint32_t index, const TensorDesc &tensorDesc) { \
return Operator::UpdateDynamicInputDesc(#x, index, tensorDesc); \
} \
_THIS_TYPE &set_dynamic_input_##x(uint32_t dstIndex, Operator &v) { \
Operator::SetInput(#x, dstIndex, v); \
return *this; \
} \
_THIS_TYPE &set_dynamic_input_##x(uint32_t dstIndex, Operator &v, const string &srcName) { \
Operator::SetInput(#x, dstIndex, v, srcName); \
return *this; \
} \
\
private: \
void __dy_input_##x() { \
(void)OpReg()

#define DYNAMIC_OUTPUT(x, t) \
N(); \
__dy_output_##x(); \
} \
\
public: \
_THIS_TYPE &create_dynamic_output_##x(unsigned int num, bool isPushBack = true) { \
Operator::DynamicOutputRegister(#x, num, isPushBack); \
return *this; \
} \
TensorDesc get_dynamic_output_desc_##x(unsigned int index) const { \
return Operator::GetDynamicOutputDesc(#x, index); \
} \
graphStatus update_dynamic_output_desc_##x(unsigned int index, const TensorDesc &tensorDesc) { \
return Operator::UpdateDynamicOutputDesc(#x, index, tensorDesc); \
} \
\
private: \
void __dy_output_##x() { \
#define DYNAMIC_OUTPUT(x, t) \
N(); \
__dy_output_##x(); \
} \
\
public: \
_THIS_TYPE &create_dynamic_output_##x(uint32_t num, bool isPushBack = true) { \
Operator::DynamicOutputRegister(#x, num, isPushBack); \
return *this; \
} \
TensorDesc get_dynamic_output_desc_##x(uint32_t index) const { return Operator::GetDynamicOutputDesc(#x, index); } \
graphStatus update_dynamic_output_desc_##x(uint32_t index, const TensorDesc &tensorDesc) { \
return Operator::UpdateDynamicOutputDesc(#x, index, tensorDesc); \
} \
\
private: \
void __dy_output_##x() { \
(void)OpReg()

#define GRAPH(x) \
@@ -258,29 +256,29 @@ class OpReg {
Operator::SubgraphCountRegister(#x, 1); \
(void)OpReg()

#define DYNAMIC_GRAPH(x) \
N(); \
__graph_##x(); \
} \
\
public: \
static const string name_graph_##x() { return #x; } \
_THIS_TYPE &create_dynamic_subgraph_##x(unsigned int num) { \
Operator::SubgraphCountRegister(#x, num); \
return *this; \
} \
SubgraphBuilder get_dynamic_subgraph_builder_##x(unsigned int index) const { \
return Operator::GetDynamicSubgraphBuilder(#x, index); \
} \
Graph get_dynamic_subgraph_##x(unsigned int index) const { return Operator::GetDynamicSubgraph(#x, index); } \
_THIS_TYPE &set_dynamic_subgraph_builder_##x(unsigned int index, const SubgraphBuilder &v) { \
Operator::SetSubgraphBuilder(#x, index, v); \
return *this; \
} \
\
private: \
void __graph_##x() { \
Operator::SubgraphRegister(#x, true); \
#define DYNAMIC_GRAPH(x) \
N(); \
__graph_##x(); \
} \
\
public: \
static const string name_graph_##x() { return #x; } \
_THIS_TYPE &create_dynamic_subgraph_##x(uint32_t num) { \
Operator::SubgraphCountRegister(#x, num); \
return *this; \
} \
SubgraphBuilder get_dynamic_subgraph_builder_##x(uint32_t index) const { \
return Operator::GetDynamicSubgraphBuilder(#x, index); \
} \
Graph get_dynamic_subgraph_##x(uint32_t index) const { return Operator::GetDynamicSubgraph(#x, index); } \
_THIS_TYPE &set_dynamic_subgraph_builder_##x(uint32_t index, const SubgraphBuilder &v) { \
Operator::SetSubgraphBuilder(#x, index, v); \
return *this; \
} \
\
private: \
void __graph_##x() { \
Operator::SubgraphRegister(#x, true); \
(void)OpReg()

#define PASTE(g_register, y) g_register##y
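
The count and index parameters of the generated dynamic input/output accessors changed from unsigned int to uint32_t (the same width on common platforms, but now explicit). A hedged usage sketch; the MyConcat definition and its TensorType argument are illustrative, not ops from this repository:

#include "external/graph/operator_reg.h"

namespace ge {
// Illustrative op with one dynamic input and one static output.
REG_OP(MyConcat)
    .DYNAMIC_INPUT(x, TensorType({DT_FLOAT}))
    .OUTPUT(y, TensorType({DT_FLOAT}))
    .OP_END_FACTORY_REG(MyConcat)
}  // namespace ge

void Wire(ge::Operator &a, ge::Operator &b) {
  ge::op::MyConcat concat("concat");
  concat.create_dynamic_input_x(2)  // num is uint32_t after this change
      .set_dynamic_input_x(0, a)
      .set_dynamic_input_x(1, b);
}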


+1 -1 inc/external/graph/types.h

@@ -24,7 +24,7 @@
namespace ge {
static const int64_t UNKNOWN_DIM = -1;
static const int64_t UNKNOWN_DIM_NUM = -2;
static const std::vector<int64_t> UNKNOWN_SHAPE = {0};
static const std::vector<int64_t> UNKNOWN_SHAPE = {-1};
static const std::vector<int64_t> UNKNOWN_RANK = {-2};

#ifdef HOST_VISIBILITY
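
Changing UNKNOWN_SHAPE from {0} to {-1} makes an unknown shape consistent with UNKNOWN_DIM, so a legitimate zero-sized dim no longer masquerades as "unknown". A small check sketch, assuming only the constants above:

#include <cstdint>
#include <vector>
#include "external/graph/types.h"  // include path assumed

// A shape is unknown if any dim equals UNKNOWN_DIM (-1); UNKNOWN_RANK ({-2})
// additionally means even the number of dims is unknown.
bool HasUnknownDim(const std::vector<int64_t> &dims) {
  for (int64_t dim : dims) {
    if (dim == ge::UNKNOWN_DIM) {
      return true;
    }
  }
  return false;
}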


+9 -0 inc/framework/common/ge_types.h

@@ -40,6 +40,14 @@ enum FrameworkType {
FMK_TYPE_RESERVED,
};

enum OpEngineType {
ENGINE_SYS = 0, // default engine
ENGINE_AICORE = 1,
ENGINE_VECTOR = 2,
ENGINE_AICUBE = 3, // not support
ENGINE_AIVECTOR = 4 // not support
};

const char *const GE_ENGINE_ATTR_MEM_TYPE_HBM = "HBM";

// Data cache, including data address and length
@@ -141,6 +149,7 @@ struct Options {
int32_t device_id;
std::string job_id;
bool isUseHcom;
bool isUseHvd;
bool deployMode;
bool isAICPUMode;
bool enable_atomic;


+7 -0 inc/framework/common/types.h

@@ -442,6 +442,7 @@ REGISTER_OPTYPE_DECLARE(STREAMMERGE, "StreamMerge");
REGISTER_OPTYPE_DECLARE(ENDGRAPH, "EndGraph");
REGISTER_OPTYPE_DECLARE(SEND, "Send");
REGISTER_OPTYPE_DECLARE(RECV, "Recv");
REGISTER_OPTYPE_DECLARE(ENDOFSEQUENCE, "EndOfSequence");

REGISTER_OPTYPE_DECLARE(LABELSET, "LabelSet");
REGISTER_OPTYPE_DECLARE(LABELGOTO, "LabelGoto");
@@ -508,6 +509,12 @@ REGISTER_OPTYPE_DECLARE(DEPTHWISEWEIGHT6D24D, "depthwise_weight_6d_2_4d");
REGISTER_OPTYPE_DECLARE(SQRTGRAD, "SqrtGrad");
REGISTER_OPTYPE_DECLARE(SIGMOIDGRAD, "SigmoidGrad");

// Horovod operator
REGISTER_OPTYPE_DECLARE(HVDCALLBACKALLREDUCE, "HorovodAllreduce");
REGISTER_OPTYPE_DECLARE(HVDCALLBACKALLGATHER, "HorovodAllgather");
REGISTER_OPTYPE_DECLARE(HVDCALLBACKBROADCAST, "HorovodBroadcast");
REGISTER_OPTYPE_DECLARE(HVDWAIT, "HorovodWait");

enum InputMode { INPUT = 0, CONST };

// Definition of the processing status enum of the process module


+0 -61 inc/framework/dlog/log.h

@@ -1,61 +0,0 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef INC_FRAMEWORK_DLOG_LOG_H_
#define INC_FRAMEWORK_DLOG_LOG_H_

#include <string>
#if !defined(__ANDROID__) && !defined(ANDROID)
#include "toolchain/slog.h"
#else
#include <android/log.h>
#endif

#ifdef _MSC_VER
#define FUNC_NAME __FUNCTION__
#else
#define FUNC_NAME __PRETTY_FUNCTION__
#endif

#if !defined(__ANDROID__) && !defined(ANDROID)
#define DAV_LOGI(MOD_NAME, fmt, ...) dlog_info(static_cast<int>(GE), "%s:" #fmt, __FUNCTION__, ##__VA_ARGS__)
#define DAV_LOGW(MOD_NAME, fmt, ...) dlog_warn(static_cast<int>(GE), "%s:" #fmt, __FUNCTION__, ##__VA_ARGS__)
#define DAV_LOGE(MOD_NAME, fmt, ...) dlog_error(static_cast<int>(GE), "%s:" #fmt, __FUNCTION__, ##__VA_ARGS__)
#define DAV_LOGD(MOD_NAME, fmt, ...) dlog_debug(static_cast<int>(GE), "%s:" #fmt, __FUNCTION__, ##__VA_ARGS__)
#define DAV_EVENT(MOD_NAME, fmt, ...) dlog_event(static_cast<int>(GE), "%s:" #fmt, __FUNCTION__, ##__VA_ARGS__)
#else
#define DAV_LOGI(MOD_NAME, fmt, ...) \
__android_log_print(ANDROID_LOG_INFO, MOD_NAME, "%s %s(%d)::" #fmt, __FILE__, __FUNCTION__, __LINE__, ##__VA_ARGS__)
#define DAV_LOGW(MOD_NAME, fmt, ...) \
__android_log_print(ANDROID_LOG_WARN, MOD_NAME, "%s %s(%d)::" #fmt, __FILE__, __FUNCTION__, __LINE__, ##__VA_ARGS__)
#define DAV_LOGE(MOD_NAME, fmt, ...) \
__android_log_print(ANDROID_LOG_ERROR, MOD_NAME, "%s %s(%d)::" #fmt, __FILE__, __FUNCTION__, __LINE__, ##__VA_ARGS__)
#define DAV_LOGD(MOD_NAME, fmt, ...) \
__android_log_print(ANDROID_LOG_DEBUG, MOD_NAME, "%s %s(%d)::" #fmt, __FILE__, __FUNCTION__, __LINE__, ##__VA_ARGS__)
#define DAV_EVENT(MOD_NAME, fmt, ...) \
__android_log_print(ANDROID_LOG_DEBUG, MOD_NAME, "%s %s(%d)::" #fmt, __FILE__, __FUNCTION__, __LINE__, ##__VA_ARGS__)
#endif

#define DLOG_DECLARE(level) \
void Log_##level(const char *mod_name, const char *func, const char *file, int line, const char *format, ...)

namespace domi {
DLOG_DECLARE(INFO);
DLOG_DECLARE(WARNING);
DLOG_DECLARE(ERROR);
} // namespace domi

#endif // INC_FRAMEWORK_DLOG_LOG_H_

+0 -0 inc/framework/ge_runtime/task_info.h


+113 -0 inc/framework/ge_runtime_dummy/davinci_model.h

@@ -0,0 +1,113 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef INC_FRAMEWORK_GE_RUNTIME_DAVINCI_MODEL_H_
#define INC_FRAMEWORK_GE_RUNTIME_DAVINCI_MODEL_H_

#include <memory>
#include <vector>

#include "ge_runtime/op_info.h"
#include "ge_runtime/task_info.h"

namespace ge {
namespace model_runner {
class DavinciModel {
public:
DavinciModel(const std::vector<std::shared_ptr<TaskInfo>> &task_info_list,
const std::vector<std::shared_ptr<OpInfo>> &data_info_list,
const std::vector<std::shared_ptr<OpInfo>> &output_info_list,
const std::vector<std::shared_ptr<OpInfo>> &constant_info_list,
const std::vector<model_runner::OpInfoPtr> &variable_info_list,
const std::vector<uint32_t> &wait_active_stream_list,
const std::vector<uint32_t> &force_copy_stream_list, uint64_t mem_size = 0, uint64_t weight_size = 0,
uint64_t var_size = 0, uintptr_t logic_mem_base = 0, uintptr_t logic_weight_base = 0,
uintptr_t logic_var_base = 0, uint32_t stream_num = 0, uint32_t batch_num = 0, uint32_t event_num = 0,
int32_t priority = 0)
: task_info_list_(task_info_list),
data_info_list_(data_info_list),
output_info_list_(output_info_list),
constant_info_list_(constant_info_list),
variable_info_list_(variable_info_list),
wait_active_stream_list_(wait_active_stream_list),
force_copy_stream_list_(force_copy_stream_list),
mem_size_(mem_size),
weight_size_(weight_size),
var_size_(var_size),
logic_mem_base_(logic_mem_base),
logic_weight_base_(logic_weight_base),
logic_var_base_(logic_var_base),
stream_num_(stream_num),
batch_num_(batch_num),
event_num_(event_num),
priority_(priority) {}
~DavinciModel() {}

uint64_t GetMemSize() const { return mem_size_; }
uint64_t GetWeightSize() const { return weight_size_; }
uint64_t GetVarSize() const { return var_size_; }

uintptr_t GetLogicMemBase() const { return logic_mem_base_; }
uintptr_t GetLogicWeightBase() const { return logic_weight_base_; }
uintptr_t GetLogicVarBase() const { return logic_var_base_; }

uint32_t GetStreamNum() const { return stream_num_; }
uint32_t GetBatchNum() const { return batch_num_; }
uint32_t GetEventNum() const { return event_num_; }

const std::vector<uint32_t> &GetWaitActiveStreams() const { return wait_active_stream_list_; }
const std::vector<uint32_t> &GetForceCopyStreams() const { return force_copy_stream_list_; }

int32_t GetPriority() const { return priority_; }

const std::vector<std::shared_ptr<TaskInfo>> &GetTaskInfoList() const { return task_info_list_; }
const std::vector<std::shared_ptr<OpInfo>> &GetDataInfoList() const { return data_info_list_; }
const std::vector<std::shared_ptr<OpInfo>> &GetOutputInfoList() const { return output_info_list_; }
const std::vector<std::shared_ptr<OpInfo>> &GetConstantInfoList() const { return constant_info_list_; }
const std::vector<model_runner::OpInfoPtr> &GetVariableInfoList() const { return variable_info_list_; }

private:
std::vector<std::shared_ptr<TaskInfo>> task_info_list_;
std::vector<std::shared_ptr<OpInfo>> data_info_list_;
std::vector<std::shared_ptr<OpInfo>> output_info_list_;
std::vector<std::shared_ptr<OpInfo>> constant_info_list_;
std::vector<model_runner::OpInfoPtr> variable_info_list_;

std::vector<uint32_t> wait_active_stream_list_;
std::vector<uint32_t> force_copy_stream_list_;

uint64_t mem_size_;
uint64_t weight_size_;
uint64_t var_size_;

uintptr_t logic_mem_base_;
uintptr_t logic_weight_base_;
uintptr_t logic_var_base_;

uint32_t stream_num_;
uint32_t batch_num_;
uint32_t event_num_;

int32_t priority_;

// Disable copy constructor and assignment operator
DavinciModel &operator=(const DavinciModel &) = delete;
DavinciModel(const DavinciModel &) = delete;
};
} // namespace model_runner
} // namespace ge

#endif // INC_FRAMEWORK_GE_RUNTIME_DAVINCI_MODEL_H_
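
A hedged construction sketch for the new header: every list is empty and all sizes fall back to their defaults, purely to show the constructor shape (the include path mirrors the header's own includes, which still say ge_runtime):

#include <memory>
#include <vector>
#include "ge_runtime/davinci_model.h"

std::shared_ptr<ge::model_runner::DavinciModel> MakeEmptyModel() {
  using namespace ge::model_runner;
  std::vector<std::shared_ptr<TaskInfo>> tasks;
  std::vector<std::shared_ptr<OpInfo>> data, outputs, constants;
  std::vector<OpInfoPtr> variables;
  std::vector<uint32_t> wait_streams, copy_streams;
  // mem/weight/var sizes, logic bases, and stream/batch/event counts all default to 0.
  return std::make_shared<DavinciModel>(tasks, data, outputs, constants, variables,
                                        wait_streams, copy_streams);
}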

+58 -0 inc/framework/ge_runtime_dummy/model_runner.h

@@ -0,0 +1,58 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef INC_FRAMEWORK_GE_RUNTIME_MODEL_RUNNER_H_
#define INC_FRAMEWORK_GE_RUNTIME_MODEL_RUNNER_H_

#include <memory>
#include <unordered_map>
#include <vector>

#include "common/ge_inner_error_codes.h"
#include "common/ge_types.h"
#include "ge_runtime/davinci_model.h"

namespace ge {
namespace model_runner {
class RuntimeModel;

class ModelRunner {
public:
static ModelRunner &Instance();

bool LoadDavinciModel(uint32_t device_id, uint64_t session_id, uint32_t model_id,
std::shared_ptr<DavinciModel> davinci_model, std::shared_ptr<ModelListener> listener);

const std::vector<uint32_t> &GetTaskIdList(uint32_t model_id) const;

bool UnloadModel(uint32_t model_id);

bool RunModel(uint32_t model_id, const InputData &input_data, OutputData *output_data);

bool GetInputOutputDescInfo(uint32_t model_id, bool zero_copy, std::vector<InputOutputDescInfo> *input_desc,
std::vector<InputOutputDescInfo> *output_desc, std::vector<uint32_t> *input_format,
std::vector<uint32_t> *output_format);

private:
ModelRunner() = default;
~ModelRunner() = default;

std::unordered_map<uint32_t, std::shared_ptr<RuntimeModel>> runtime_models_;
};
} // namespace model_runner
} // namespace ge

#endif // INC_FRAMEWORK_GE_RUNTIME_MODEL_RUNNER_H_
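
A sketch of the intended load/run/unload sequence through the singleton; the IDs are placeholders, and a real caller would pass a ModelListener rather than nullptr:

#include <memory>
#include "ge_runtime/model_runner.h"

bool RunOnce(const std::shared_ptr<ge::model_runner::DavinciModel> &model,
             const ge::InputData &input, ge::OutputData *output) {
  auto &runner = ge::model_runner::ModelRunner::Instance();
  const uint32_t device_id = 0;   // placeholder
  const uint64_t session_id = 0;  // placeholder
  const uint32_t model_id = 1;    // placeholder
  if (!runner.LoadDavinciModel(device_id, session_id, model_id, model, nullptr)) {
    return false;
  }
  const bool ok = runner.RunModel(model_id, input, output);
  (void)runner.UnloadModel(model_id);
  return ok;
}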

+72 -0 inc/framework/ge_runtime_dummy/op_info.h

@@ -0,0 +1,72 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef INC_FRAMEWORK_GE_RUNTIME_OP_INFO_H_
#define INC_FRAMEWORK_GE_RUNTIME_OP_INFO_H_

#include <memory>
#include <string>
#include <vector>

namespace ge {
namespace model_runner {
struct TensorInfo {
int64_t GetShapeSize() const {
int64_t res = 1;
if (dims.empty()) {
return 0;
}
for (auto dim : dims) {
res *= dim;
}
return res;
}

int64_t GetDim(uint32_t index) {
if (index >= dims.size()) {
return 0;
}
return dims[index];
}

std::vector<int64_t> dims;
uint32_t datatype;
uint32_t format;
uint32_t real_dim_cnt;
uint32_t size;
bool is_output;
};

struct OpInfo {
uint32_t index;
std::string name;
std::string type;
bool var_is_broadcast;
std::vector<uintptr_t> input_addrs;
std::vector<uintptr_t> output_addrs;
std::vector<TensorInfo> input_tensors;
std::vector<TensorInfo> output_tensors;
std::vector<TensorInfo> weight_tensors;
std::vector<std::string> src_name;
std::vector<int64_t> src_index;
std::string weight_data;
};

using TensorInfoPtr = std::shared_ptr<TensorInfo>;
using OpInfoPtr = std::shared_ptr<OpInfo>;
} // namespace model_runner
} // namespace ge
#endif // INC_FRAMEWORK_GE_RUNTIME_OP_INFO_H_
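
Note that GetShapeSize returns 0 for an empty dims vector rather than treating it as a scalar, and it does not guard against overflow. A quick sketch:

#include "ge_runtime/op_info.h"

int64_t ExampleShapeSize() {
  ge::model_runner::TensorInfo info{};
  info.dims = {2, 3, 4};
  return info.GetShapeSize();  // 2 * 3 * 4 = 24; an empty dims vector yields 0, not 1
}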

+394 -0 inc/framework/ge_runtime_dummy/task_info.h

@@ -0,0 +1,394 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef INC_FRAMEWORK_GE_RUNTIME_TASK_INFO_H_
#define INC_FRAMEWORK_GE_RUNTIME_TASK_INFO_H_

#include <stdint.h>
#include <functional>
#include <memory>
#include <string>
#include <vector>

#include "cce/taskdown_api.h"

namespace ge {
namespace model_runner {
enum TaskInfoType {
CCE = 0,
TBE,
AICPU,
LABEL_SET,
LABEL_SWITCH,
LABEL_GOTO,
EVENT_RECORD,
EVENT_WAIT,
FUSION_START,
FUSION_END,
HCCL,
PROFILER_TRACE,
MEMCPY_ASYNC,
STREAM_SWITCH,
STREAM_ACTIVE,
// Insert new task type here
REVSERVED = 23
};

class TaskInfo {
public:
virtual ~TaskInfo() {}
uint32_t stream_id() const { return stream_id_; }
TaskInfoType type() const { return type_; }

protected:
TaskInfo(uint32_t stream_id, TaskInfoType type) : stream_id_(stream_id), type_(type) {}

private:
uint32_t stream_id_;
TaskInfoType type_;
};

class CceTaskInfo : public TaskInfo {
public:
CceTaskInfo(uint32_t stream_id, const cce::ccOpContext &ctx, const std::string &stub_func, uint32_t block_dim,
const std::vector<uint8_t> &args, uint32_t args_size, const std::vector<uint8_t> &sm_desc,
const std::vector<uint8_t> &flow_table, const std::vector<uint8_t> &args_offset, bool is_flowtable)
: TaskInfo(stream_id, TaskInfoType::CCE),
ctx_(ctx),
stub_func_(stub_func),
block_dim_(block_dim),
args_(args),
args_size_(args_size),
sm_desc_(sm_desc),
flow_table_(flow_table),
args_offset_(args_offset),
is_flowtable_(is_flowtable) {}
~CceTaskInfo() override {}

cce::ccOpContext cc_context() const { return ctx_; }
std::string stub_func() const { return stub_func_; }
uint32_t block_dim() const { return block_dim_; }
const std::vector<uint8_t> &args() const { return args_; }
uint32_t args_size() const { return args_size_; }
const std::vector<uint8_t> &sm_desc() const { return sm_desc_; }
const std::vector<uint8_t> &flow_table() const { return flow_table_; }
const std::vector<uint8_t> &args_offset() const { return args_offset_; }
bool is_flowtable() const { return is_flowtable_; }

private:
cce::ccOpContext ctx_;
std::string stub_func_;
uint32_t block_dim_;
std::vector<uint8_t> args_;
uint32_t args_size_;
std::vector<uint8_t> sm_desc_;
std::vector<uint8_t> flow_table_;
std::vector<uint8_t> args_offset_;
bool is_flowtable_;
};

class TbeTaskInfo : public TaskInfo {
public:
TbeTaskInfo(uint32_t stream_id, const std::string &stub_func, uint32_t block_dim, const std::vector<uint8_t> &args,
uint32_t args_size, const std::vector<uint8_t> &sm_desc, void *binary, uint32_t binary_size,
const std::vector<uint8_t> &meta_data, const std::vector<void *> &input_data_addrs,
const std::vector<void *> &output_data_addrs, const std::vector<void *> &workspace_addrs)
: TaskInfo(stream_id, TaskInfoType::TBE),
stub_func_(stub_func),
block_dim_(block_dim),
args_(args),
args_size_(args_size),
sm_desc_(sm_desc),
binary_(binary),
binary_size_(binary_size),
meta_data_(meta_data),
input_data_addrs_(input_data_addrs),
output_data_addrs_(output_data_addrs),
workspace_addrs_(workspace_addrs) {}
~TbeTaskInfo() override {}

const std::string &stub_func() const { return stub_func_; }
uint32_t block_dim() const { return block_dim_; }
const std::vector<uint8_t> &args() const { return args_; }
uint32_t args_size() const { return args_size_; }
const std::vector<uint8_t> &sm_desc() const { return sm_desc_; }
void *binary() const { return binary_; }
uint32_t binary_size() const { return binary_size_; }
const std::vector<uint8_t> &meta_data() const { return meta_data_; }
const std::vector<void *> &input_data_addrs() const { return input_data_addrs_; }
const std::vector<void *> &output_data_addrs() const { return output_data_addrs_; }
const std::vector<void *> &workspace_addrs() const { return workspace_addrs_; }

void SetBinary(void *binary, uint32_t binary_size) {
binary_ = binary;
binary_size_ = binary_size;
}

private:
std::string stub_func_;
uint32_t block_dim_;
std::vector<uint8_t> args_;
uint32_t args_size_;
std::vector<uint8_t> sm_desc_;
void *binary_;
uint32_t binary_size_;
std::vector<uint8_t> meta_data_;
std::vector<void *> input_data_addrs_;
std::vector<void *> output_data_addrs_;
std::vector<void *> workspace_addrs_;
};

class AicpuTaskInfo : public TaskInfo {
public:
AicpuTaskInfo(uint32_t stream_id, const string &so_name, const std::string &kernel_name, const std::string &node_def,
const std::vector<void *> &input_data_addrs, const std::vector<void *> &output_data_addrs)
: TaskInfo(stream_id, TaskInfoType::AICPU),
so_name_(so_name),
kernel_name_(kernel_name),
node_def_(node_def),
input_data_addrs_(input_data_addrs),
output_data_addrs_(output_data_addrs) {}
~AicpuTaskInfo() override {}

const std::string &so_name() const { return so_name_; }
const std::string &kernel_name() const { return kernel_name_; }
const std::string &node_def() const { return node_def_; }
const std::vector<void *> &input_data_addrs() const { return input_data_addrs_; }
const std::vector<void *> &output_data_addrs() const { return output_data_addrs_; }

private:
std::string so_name_;
std::string kernel_name_;
std::string node_def_;
std::vector<void *> input_data_addrs_;
std::vector<void *> output_data_addrs_;
};

class LabelTaskInfo : public TaskInfo {
public:
uint32_t label_id() const { return label_id_; }

protected:
LabelTaskInfo(uint32_t stream_id, TaskInfoType type, uint32_t label_id)
: TaskInfo(stream_id, type), label_id_(label_id) {}
virtual ~LabelTaskInfo() override {}

uint32_t label_id_;
};

class LabelSetTaskInfo : public LabelTaskInfo {
public:
LabelSetTaskInfo(uint32_t stream_id, uint32_t label_id)
: LabelTaskInfo(stream_id, TaskInfoType::LABEL_SET, label_id) {}
~LabelSetTaskInfo() override {}
};

class LabelSwitchTaskInfo : public LabelTaskInfo {
public:
LabelSwitchTaskInfo(uint32_t stream_id, uint32_t label_id)
: LabelTaskInfo(stream_id, TaskInfoType::LABEL_SWITCH, label_id) {}
~LabelSwitchTaskInfo() override {}
};

class LabelGotoTaskInfo : public LabelTaskInfo {
public:
LabelGotoTaskInfo(uint32_t stream_id, uint32_t label_id)
: LabelTaskInfo(stream_id, TaskInfoType::LABEL_GOTO, label_id) {}
~LabelGotoTaskInfo() override {}
};

class EventTaskInfo : public TaskInfo {
public:
uint32_t event_id() const { return event_id_; }

protected:
EventTaskInfo(uint32_t stream_id, TaskInfoType type, uint32_t event_id)
: TaskInfo(stream_id, type), event_id_(event_id) {}
virtual ~EventTaskInfo() override {}

uint32_t event_id_;
};

class EventRecordTaskInfo : public EventTaskInfo {
public:
EventRecordTaskInfo(uint32_t stream_id, uint32_t event_id)
: EventTaskInfo(stream_id, TaskInfoType::EVENT_RECORD, event_id) {}
~EventRecordTaskInfo() override {}
};

class EventWaitTaskInfo : public EventTaskInfo {
public:
EventWaitTaskInfo(uint32_t stream_id, uint32_t event_id)
: EventTaskInfo(stream_id, TaskInfoType::EVENT_WAIT, event_id) {}
~EventWaitTaskInfo() override {}
};

class FusionStartTaskInfo : public TaskInfo {
public:
explicit FusionStartTaskInfo(uint32_t stream_id) : TaskInfo(stream_id, TaskInfoType::FUSION_START) {}
~FusionStartTaskInfo() override {}
};

class FusionEndTaskInfo : public TaskInfo {
public:
explicit FusionEndTaskInfo(uint32_t stream_id) : TaskInfo(stream_id, TaskInfoType::FUSION_END) {}
~FusionEndTaskInfo() override {}
};

class HcclTaskInfo : public TaskInfo {
public:
HcclTaskInfo(uint32_t stream_id, const std::string hccl_type, void *input_data_addr, void *output_data_addr,
void *workspace_addr, int64_t workspace_size, int64_t hccl_stream_num,
const std::vector<uint8_t> &private_def, void *ops_kernel_store, int32_t count, int64_t root_id,
int64_t op_type, int64_t data_type, std::function<bool(void *, void *)> hcom_bind_model,
std::function<bool(void *)> hcom_unbind_model,
std::function<bool(std::shared_ptr<HcclTaskInfo>, void *)> hcom_distribute_task)
: TaskInfo(stream_id, TaskInfoType::HCCL),
hccl_type_(hccl_type),
input_data_addr_(input_data_addr),
output_data_addr_(output_data_addr),
workspace_addr_(workspace_addr),
workspace_size_(workspace_size),
hccl_stream_num_(hccl_stream_num),
private_def_(private_def),
ops_kernel_store_(ops_kernel_store),
count_(count),
root_id_(root_id),
op_type_(op_type),
data_type_(data_type),
hcom_bind_model_(hcom_bind_model),
hcom_unbind_model_(hcom_unbind_model),
hcom_distribute_task_(hcom_distribute_task) {}
~HcclTaskInfo() override {}

const std::string &hccl_type() const { return hccl_type_; }
void *input_data_addr() const { return input_data_addr_; }
void *output_data_addr() const { return output_data_addr_; }
void *workspace_addr() const { return workspace_addr_; }
int64_t workspace_size() const { return workspace_size_; }
int64_t hccl_stream_num() const { return hccl_stream_num_; }
const std::vector<uint8_t> &private_def() const { return private_def_; }
void *ops_kernel_store() const { return ops_kernel_store_; }
int32_t count() const { return count_; }
int64_t root_id() const { return root_id_; }
int64_t op_type() const { return op_type_; }
int64_t data_type() const { return data_type_; }
std::function<bool(void *, void *)> hcom_bind_model() const { return hcom_bind_model_; }
std::function<bool(void *)> hcom_unbind_model() const { return hcom_unbind_model_; }
std::function<bool(std::shared_ptr<HcclTaskInfo>, void *)> hcom_distribute_task() const {
return hcom_distribute_task_;
}

private:
std::string hccl_type_;
void *input_data_addr_;
void *output_data_addr_;
void *workspace_addr_;
int64_t workspace_size_;
int64_t hccl_stream_num_;
std::vector<uint8_t> private_def_;
void *ops_kernel_store_;
int32_t count_;
int64_t root_id_;
int64_t op_type_;
int64_t data_type_;
std::function<bool(void *, void *)> hcom_bind_model_;
std::function<bool(void *)> hcom_unbind_model_;
std::function<bool(std::shared_ptr<HcclTaskInfo>, void *)> hcom_distribute_task_;
};

class ProfilerTraceTaskInfo : public TaskInfo {
public:
ProfilerTraceTaskInfo(uint32_t stream_id, uint64_t log_id, bool notify, uint32_t flat)
: TaskInfo(stream_id, TaskInfoType::PROFILER_TRACE), log_id_(log_id), notify_(notify), flat_(flat) {}
~ProfilerTraceTaskInfo() override {}

uint64_t log_id() const { return log_id_; }
bool notify() const { return notify_; }
uint32_t flat() const { return flat_; }

private:
uint64_t log_id_;
bool notify_;
uint32_t flat_;
};

class MemcpyAsyncTaskInfo : public TaskInfo {
public:
MemcpyAsyncTaskInfo(uint32_t stream_id, void *dst, uint64_t dst_max, void *src, uint64_t count, uint32_t kind)
: TaskInfo(stream_id, TaskInfoType::MEMCPY_ASYNC),
dst_(dst),
dst_max_(dst_max),
src_(src),
count_(count),
kind_(kind) {}
~MemcpyAsyncTaskInfo() override {}

void *dst() const { return dst_; }
uint64_t dst_max() const { return dst_max_; }
void *src() const { return src_; }
uint64_t count() const { return count_; }
uint32_t kind() const { return kind_; }

private:
void *dst_;
uint64_t dst_max_;
void *src_;
uint64_t count_;
uint32_t kind_;
};

class StreamSwitchTaskInfo : public TaskInfo {
public:
StreamSwitchTaskInfo(uint32_t stream_id, int64_t true_stream_id, void *input_addr, void *value_addr, int64_t cond,
int64_t data_type)
: TaskInfo(stream_id, TaskInfoType::STREAM_SWITCH),
true_stream_id_(true_stream_id),
input_addr_(input_addr),
value_addr_(value_addr),
cond_(cond),
data_type_(data_type) {}
~StreamSwitchTaskInfo() override {}

int64_t true_stream_id() const { return true_stream_id_; }
void *input_addr() const { return input_addr_; }
void *value_addr() const { return value_addr_; }
int64_t cond() const { return cond_; }
int64_t data_type() const { return data_type_; }

private:
int64_t true_stream_id_;
void *input_addr_;
void *value_addr_;
int64_t cond_;
int64_t data_type_;
};

class StreamActiveTaskInfo : public TaskInfo {
public:
StreamActiveTaskInfo(uint32_t stream_id, uint32_t active_stream_id)
: TaskInfo(stream_id, TaskInfoType::STREAM_ACTIVE), active_stream_id_(active_stream_id) {}
~StreamActiveTaskInfo() override {}

uint32_t active_stream_id() const { return active_stream_id_; }

private:
uint32_t active_stream_id_;
};
} // namespace model_runner
} // namespace ge

#endif // INC_FRAMEWORK_GE_RUNTIME_TASK_INFO_H_
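
A sketch assembling a small task list from the new classes; the stream and event IDs are placeholders:

#include <memory>
#include <vector>
#include "ge_runtime/task_info.h"

std::vector<std::shared_ptr<ge::model_runner::TaskInfo>> MakeTasks() {
  using namespace ge::model_runner;
  std::vector<std::shared_ptr<TaskInfo>> tasks;
  // Record event 5 on stream 0, make stream 1 wait for it,
  // then activate stream 1 from stream 0.
  tasks.push_back(std::make_shared<EventRecordTaskInfo>(/*stream_id=*/0, /*event_id=*/5));
  tasks.push_back(std::make_shared<EventWaitTaskInfo>(/*stream_id=*/1, /*event_id=*/5));
  tasks.push_back(std::make_shared<StreamActiveTaskInfo>(/*stream_id=*/0, /*active_stream_id=*/1));
  return tasks;
}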

+21 -1 inc/framework/generator/ge_generator.h

@@ -23,6 +23,7 @@
#include <vector>
#include "ge/ge_ir_build.h"
#include "common/ge_inner_error_codes.h"
#include "common/ge_types.h"
#include "graph/ge_tensor.h"
#include "graph/graph.h"
#include "graph/op_desc.h"
@@ -30,9 +31,13 @@
namespace ge {
class GeGenerator {
public:
static GeGenerator &GetInstance() {
static GeGenerator Instance;
return Instance;
}
GeGenerator() = default;

~GeGenerator() = default;
~GeGenerator() { (void)Finalize(); }

GeGenerator(const GeGenerator &) = delete;

@@ -60,10 +65,25 @@ class GeGenerator {
///
Status BuildSingleOpModel(OpDescPtr &op_desc, const std::vector<GeTensor> &inputs,
const std::vector<GeTensor> &outputs, const std::string &model_file_name);
///
/// @ingroup ge
/// @brief: Build single Op into model buff.
/// @param [in] op_desc: the OP description.
/// @param [in] inputs: input tensors.
/// @param [in] outputs: output tensors.
/// @param [in] engine_type: specific engine.
/// @param [out] model_buff: model buff of single op.
/// @return SUCCESS or FAILED
Status BuildSingleOpModel(OpDescPtr &op_desc, const vector<GeTensor> &inputs, const vector<GeTensor> &outputs,
OpEngineType engine_type, ModelBufferData &model_buff);

private:
Status GenerateModel(const Graph &graph, const string &file_name_prefix, const vector<GeTensor> &inputs,
ge::ModelBufferData &model, bool is_offline = true);
Status BuildSingleOp(OpDescPtr &op_desc, const vector<GeTensor> &inputs, const vector<GeTensor> &outputs,
const string &model_file_name, OpEngineType engine_type, ModelBufferData &model_buff,
bool is_offline = true);

class Impl;

std::shared_ptr<Impl> impl_;
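
The new overload builds a single op straight into a memory buffer for a chosen engine instead of writing an offline model file. A hedged call sketch; op_desc and the tensor vectors are assumed to be prepared by the caller:

#include <vector>
#include "framework/generator/ge_generator.h"  // include path assumed

ge::Status BuildOpToBuffer(ge::OpDescPtr &op_desc, const std::vector<ge::GeTensor> &inputs,
                           const std::vector<ge::GeTensor> &outputs, ge::ModelBufferData &buff) {
  // ENGINE_SYS is the default engine in the new OpEngineType enum (ge_types.h).
  return ge::GeGenerator::GetInstance().BuildSingleOpModel(op_desc, inputs, outputs,
                                                           ge::ENGINE_SYS, buff);
}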


+113 -0 inc/framework/omg/omg.h

@@ -0,0 +1,113 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef INC_FRAMEWORK_OMG_OMG_H_
#define INC_FRAMEWORK_OMG_OMG_H_

#include <google/protobuf/message.h>
#include <string>
#include <unordered_map>
#include <vector>
#include "framework/common/types.h"
#include "framework/omg/omg_inner_types.h"
#include "proto/ge_ir.pb.h"
#include "proto/om.pb.h"

#include "graph/compute_graph.h"
#include "graph/graph.h"
#include "graph/model.h"
#include "runtime/kernel.h"

using domi::Status;
using std::pair;
using std::string;
using std::unordered_map;
using std::vector;

namespace ge {
/**
* @ingroup domi_omg
* @brief init omg context
* @return Status result code
*/
Status InitDomiOmgContext(const string &input_shape, const string &input_format, const string &net_format,
bool is_dynamic_input);

/**
* @ingroup domi_omg
* @brief generate graph based on the input model file and weight file
* @param [out] graph graph
* @param [in] model_file path of model file
* @param [in] weights_file path of weight file
* @param [in] type type of the input model
* @param [in] op_conf op mapping configuration
* @param [in] target type of platform. If a tiny model is generated, set target to tiny
* @param [in] run_mode run mode
* @param [in] enable_l2dynamic enable l2dynamic
* @param [in] is_dynamic_input dynamic input, true or false
* @param [in] atc_params multiple atc params
* @return Status result code
*/
Status ParseGraph(ge::Graph &graph, const std::map<string, string> &atc_params, const char *model_file,
const char *weights_file, domi::FrameworkType type, const char *op_conf = nullptr,
const char *target = nullptr, RunMode run_mode = GEN_OM_MODEL, bool is_dynamic_input = false);

/**
* @ingroup domi_omg
* @brief generates a simplified JSON file based on the key value of the offline model file in protobuf format
* @param [in] model_file path of offline model file
* @param [out] json_file path of json file
* @param [key] encrypted key
* @return Status result code
*/
Status ConvertOmModelToJson(const char *model_file, const char *json_file);

Status ConvertPbtxtToJson(const char *model_file, const char *json_file);
/**
* @ingroup domi_omg
* @brief convert the model file in protobuf format into a JSON file.
* @param [in] framework type of model
* @param [in] om model_file path of offline model file
* @param [out] json_file path of json file
* @param [key] encrypted key
* @return Status result code
*/
Status ConvertFwkModelToJson(domi::FrameworkType framework, const char *model_file, const char *json_file);

void GetGroupName(ge::proto::ModelDef &model);

void FindParserSo(const string &path, vector<string> &fileList, string &caffe_parser_path);

Status CheckCustomAiCpuOpLib();

Status DumpInfershapeJson(const ge::Graph &graph, const char *json_file);

Status SetOutputNodeInfo(ge::Graph &graph, const std::string &output_type, const std::string &output_format);

Status GetOutputLeaf(ge::NodePtr node, std::vector<std::pair<ge::NodePtr, int32_t>> &output_nodes_info,
std::vector<std::string> &output_nodes_name);
} // namespace ge

namespace domi {
/**
* @ingroup domi_omg
* @brief get omg context
* @return reference of OmgContext
*/
ge::OmgContext &GetContext();
} // namespace domi

#endif // INC_FRAMEWORK_OMG_OMG_H_
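
A small sketch of the JSON conversion entry points; the paths are placeholders:

#include "framework/omg/omg.h"  // include path assumed

domi::Status DumpModelAsJson() {
  // Produce a simplified JSON view of a protobuf offline model.
  return ge::ConvertOmModelToJson("./model.om", "./model.json");
}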

+6 -5 inc/framework/omg/omg_inner_types.h

@@ -44,11 +44,10 @@ namespace ge {
* @brief run model
*/
enum RunMode {
GEN_OM_MODEL = 0, // generate offline model file
MODEL_TO_JSON = 1, // convert to JSON file
MODEL_TO_JSON_WITH_SHAPE = 2, // convert to json file with shape
ONLY_PRE_CHECK = 3, // only for pre-check
PBTXT_TO_JSON = 5 // pbtxt to json
GEN_OM_MODEL = 0, // generate offline model file
MODEL_TO_JSON = 1, // convert to JSON file
ONLY_PRE_CHECK = 3, // only for pre-check
PBTXT_TO_JSON = 5 // pbtxt to json
};

///
@@ -93,6 +92,8 @@ struct OmgContext {
std::map<std::string, std::vector<int32_t>> out_nodes_map;
// user-designated out nodes (this is used for determining the order)
std::vector<std::pair<std::string, int32_t>> user_out_nodes;
// net out nodes (either user_out_nodes or leaf nodes)
std::vector<std::string> net_out_nodes;
// path for the aicpu custom operator so_file
std::vector<std::string> aicpu_op_run_paths;
// ddk version


+2 -0 inc/graph/compute_graph.h

@@ -235,6 +235,8 @@ class ComputeGraph : public std::enable_shared_from_this<ComputeGraph>, public A
std::vector<NodePtr> &stack);
graphStatus BFSTopologicalSorting(std::vector<NodePtr> &node_vec, std::map<NodePtr, uint32_t> &map_in_edge_num,
std::deque<NodePtr> &stack);
graphStatus BFSTopologicalSortingWithGroup(std::vector<NodePtr> &node_vec,
std::map<NodePtr, uint32_t> &map_in_edge_num, std::deque<NodePtr> &stack);
graphStatus CollectBreadthOutNode(const NodePtr &node, std::map<NodePtr, uint32_t> &map_in_edge_num,
std::map<string, NodePtr> &breadth_node_map);
graphStatus TopologicalSortingGraph();


+19 -1 inc/graph/debug/ge_attr_define.h

@@ -94,6 +94,10 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM

GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_FORMAT;

GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_STORAGE_FORMAT;

GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_STORAGE_SHAPE;

GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_FILTER_FORMAT;

GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_LRN_K;
@@ -133,6 +137,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string NEW_AIPP_CONV_OP;

GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_SESSION_GRAPH_ID;
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_PARENT_GRAPH_NAME;

GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_MULTISHAPE_BATCHLIST;
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_MULTISHAPE_BATCHLIST_SIZE;
@@ -692,6 +697,8 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MOD

GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_ZERO_COPY_MEMORY_SIZE;

GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_OUT_NODES_NAME;

GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_WEIGHT_SIZE;

GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_MODEL_TASK_GEN_BASE_ADDR;
@@ -920,6 +927,7 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_STREAM_SWITCH_COND;
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_ACTIVE_STREAM_LIST;
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_SWITCHN_PRED_VALUE;
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_SUBGRAPH_FIRST_ACTIVE;

GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_SWITCH_BRANCH_NODE_LABEL;
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG;
@@ -999,14 +1007,17 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_SWITCH_FOR_L2_FUSION;

// functional ops attr
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_IF_THEN_BRANCH;
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_IF_ELSE_BRANCH;
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_WHILE_COND;
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_WHILE_BODY;

// used for label switch
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_LABEL_SWITCH_INDEX;
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_LABEL_SWITCH_LIST;
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_SUBGRAPH_END_NODE;

// Varible
// Variable
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REF_VAR_SRC_VAR_NAME;
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string VAR_ATTR_SRC_VAR_NAME;
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string REF_VAR_PRE_PEER_OUT_INDEX;
@@ -1032,6 +1043,13 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAM

// Dynamic stitch
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string DYNAMIC_STITCH_ATTR_NAME_NUM;

// Used for support Horovod
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_INTER_EVENT_IDENTIFY;
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_HOROVOD_ATTR_REDUCE_TYPE;
// for gradient group
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_HCCL_FUSED_GROUP;
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY extern const std::string ATTR_NAME_HCCL_FUSED_FLAG;
} // namespace ge

#endif // INC_GRAPH_DEBUG_GE_ATTR_DEFINE_H_

+3 -1 inc/graph/op_desc.h

@@ -264,6 +264,8 @@ class OpDesc : public std::enable_shared_from_this<OpDesc>, public AttrHolder {
graphStatus SetSubgraphInstanceName(uint32_t index, const std::string &name);
void RemoveSubgraphInstanceName(const std::string &name);

graphStatus GetSubgraphNameByInstanceName(const std::string &instance_name, std::string &subgraph_name) const;

protected:
ProtoAttrMapHelper MutableAttrMap() override;
ConstProtoAttrMapHelper GetAttrMap() const override;
@@ -288,7 +290,7 @@ class OpDesc : public std::enable_shared_from_this<OpDesc>, public AttrHolder {

// subgraph ir names to type, for a `if` operator:
// then_branch: static
// else_branch: dynamic
// else_branch: static
// or for a `case` op:
// branches: dynamic
std::map<std::string, SubgraphType> subgraph_ir_names_to_type_;


+46 -0 inc/graph/runtime_inference_context.h

@@ -0,0 +1,46 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef INC_GRAPH_RUNTIME_INFERENCE_CONTEXT_H_
#define INC_GRAPH_RUNTIME_INFERENCE_CONTEXT_H_

#include <map>
#include <memory>
#include <mutex>
#include <vector>
#include "external/graph/ge_error_codes.h"
#include "external/graph/tensor.h"

namespace ge {
class GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY RuntimeInferenceContext {
public:
static graphStatus GetContext(const std::string &context_id, RuntimeInferenceContext **ctx);
static graphStatus CreateContext(const std::string &context_id);
static void DestroyContext(const std::string &context_id);

graphStatus SetTensor(int64_t node_id, int output_id, Tensor &&tensor);
graphStatus GetTensor(int64_t node_id, int output_id, Tensor &tensor);

private:
std::map<int64_t, std::vector<Tensor>> tensors_;
std::mutex mu_;

static std::map<std::string, std::unique_ptr<RuntimeInferenceContext>> contexts_;
static std::mutex ctx_mu_;
};
} // namespace ge

#endif // INC_GRAPH_RUNTIME_INFERENCE_CONTEXT_H_
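
For orientation, here is a minimal usage sketch of the context API declared above, assuming a session-scoped context id; the id string, node id, and tensor values are hypothetical:

```cpp
// Hedged sketch: exercises the RuntimeInferenceContext API declared above.
// The context id "session_1", node id 42 and tensor contents are illustrative only.
#include <string>
#include <utility>
#include "graph/runtime_inference_context.h"

ge::graphStatus Demo() {
  const std::string ctx_id = "session_1";  // hypothetical session id
  if (ge::RuntimeInferenceContext::CreateContext(ctx_id) != ge::GRAPH_SUCCESS) {
    return ge::GRAPH_FAILED;
  }
  ge::RuntimeInferenceContext *ctx = nullptr;
  if (ge::RuntimeInferenceContext::GetContext(ctx_id, &ctx) != ge::GRAPH_SUCCESS) {
    return ge::GRAPH_FAILED;
  }
  // Publish the value produced by node 42, output 0 ...
  ge::Tensor value;
  (void)ctx->SetTensor(42, 0, std::move(value));
  // ... and read it back elsewhere, e.g. from Operator::GetInputConstData.
  ge::Tensor fetched;
  ge::graphStatus ret = ctx->GetTensor(42, 0, fetched);
  ge::RuntimeInferenceContext::DestroyContext(ctx_id);
  return ret;
}
```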

+ 53
- 0
inc/graph/utils/graph_utils.h View File

@@ -29,6 +29,18 @@
#include "graph/graph.h"
#include "graph/model.h"

#define GE_DUMP(compute_graph, name) \
do { \
GraphUtils::DumpGEGraph(compute_graph, name); \
GraphUtils::DumpGEGraphToOnnx(*compute_graph, name); \
for (const auto &sub_graph_func : compute_graph->GetAllSubgraphs()) { \
static int8_t i = 0; \
auto sub_graph_func_name = std::string(name) + std::string("_sub_graph_") + std::to_string(i++); \
GraphUtils::DumpGEGraph(sub_graph_func, sub_graph_func_name); \
GraphUtils::DumpGEGraphToOnnx(*sub_graph_func, sub_graph_func_name); \
} \
} while (0)

#define REFER_ATTR_VALUE(VT_ENUM, DataType, attr, ret) \
do { \
DataType ret; \
@@ -155,6 +167,8 @@ class GraphUtils {
static graphStatus InsertNodeBetweenDataAnchors(const OutDataAnchorPtr &src, const InDataAnchorPtr &dst,
const NodePtr &new_node);

static graphStatus RemoveSubgraphRecursively(const ComputeGraphPtr &compute_graph, const NodePtr &remove_node);

static graphStatus RemoveNodeWithoutRelink(const ComputeGraphPtr &compute_graph, const NodePtr &node);

static graphStatus InsertTransNode(ComputeGraphPtr compute_graph, const InDataAnchorPtr &in_data_anchor,
@@ -299,6 +313,24 @@ class GraphUtils {
std::map<std::string, std::vector<NodeIndexIO>> &symbol_to_anchors,
std::map<std::string, std::string> &anchor_to_symbol);

///
/// Determine whether the graph is an UNKNOWN_SHAPE graph, based on whether the graph or any of
/// its subgraphs contains an UNKNOWN_SHAPE operator.
/// Note: This function only looks 'down' from the graph, not 'up'. For example, in the following
/// scenario (K for known shape, U for unknown shape), ROOT graph is UNKNOWN_SHAPE while SUB graph is KNOWN_SHAPE
/// ROOT graph: A -----> B -----> C
/// K subgraph U
/// |
/// V
/// SUB graph: D --> E --> F
/// K K K
/// @param [in] graph
/// @return bool
///
static bool IsUnknownShapeGraph(const ComputeGraphPtr &graph);

static NodePtr FindNodeFromAllNodes(ComputeGraphPtr &graph, const std::string &name);

private:
///
/// Get reference-mapping for in_data_anchors of node
@@ -438,6 +470,11 @@ class ComputeGraphBuilder {
///
NodePtr GetNode(const std::string &name);

///
/// @brief Get all nodes
/// @return std::vector<NodePtr>
///
std::vector<NodePtr> GetAllNodes();

protected:
///
/// @brief Build nodes
@@ -536,6 +573,13 @@ class CompleteGraphBuilder : public ComputeGraphBuilder {
CompleteGraphBuilder &AddOutput(const std::string &owner_node_name, uint32_t anchor_ind);

///
/// @brief Add target for graph
/// @param [in] target_name
/// @return CompleteGraphBuilder
///
CompleteGraphBuilder &AddTarget(const std::string &target_name);

///
/// @brief Set parent-node of graph
/// @param [in] parent_node
/// @return CompleteGraphBuilder
@@ -590,10 +634,19 @@ class CompleteGraphBuilder : public ComputeGraphBuilder {
///
void AddRetValNodes(graphStatus &error_code, std::string &error_msg);

///
/// @brief Build target-nodes for graph
/// @param [out] error_code
/// @param [out] error_msg
/// @return void
///
void BuildGraphTargets(graphStatus &error_code, std::string &error_msg);

std::string name_;
NodePtr parent_node_;
std::map<uint32_t, std::pair<std::vector<std::string>, std::vector<uint32_t>>> graph_inputs_;
std::vector<std::pair<std::string, uint32_t>> graph_outputs_;
std::vector<std::string> graph_targets_;

// index_of_graph_input -> in_anchor_index_of_parent_node
std::map<uint32_t, uint32_t> input_mapping_;
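
Taken together, the new GE_DUMP macro and FindNodeFromAllNodes helper are intended to be called roughly as follows; a hedged sketch, where the dump tag and node name are made up:

```cpp
// Hedged sketch of the new graph_utils.h additions; names are illustrative.
#include "graph/utils/graph_utils.h"

ge::NodePtr DumpAndFind(ge::ComputeGraphPtr &graph) {
  // Dumps the graph and every subgraph, in both GE-graph and ONNX form.
  GE_DUMP(graph, "AfterPartition");
  // Searches the root graph and all of its subgraphs by node name.
  return ge::GraphUtils::FindNodeFromAllNodes(graph, "conv1");
}
```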


+ 20
- 0
inc/graph/utils/node_utils.h View File

@@ -17,10 +17,23 @@
#ifndef INC_GRAPH_UTILS_NODE_UTILS_H_
#define INC_GRAPH_UTILS_NODE_UTILS_H_

#include <set>
#include <map>
#include <vector>
#include "graph/node.h"

namespace ge {
// Op types of Const-like ops.
extern const std::set<std::string> kConstOpTypes;
// Op types of If-like ops.
extern const std::set<std::string> kIfOpTypes;
// Op types of While-like ops.
extern const std::set<std::string> kWhileOpTypes;
// Op types of Case-like ops.
extern const std::set<std::string> kCaseOpTypes;
// Op types of For-like ops.
extern const std::set<std::string> kForOpTypes;

class NodeUtils {
public:
static graphStatus AddSendEventId(const NodePtr &node, const uint32_t &event_id);
@@ -94,6 +107,13 @@ class NodeUtils {
///
static bool GetConstOpType(const NodePtr &in_node, std::string &op_type);

///
/// @brief Recursively remove a node's subgraphs, including subgraphs of nodes inside those subgraphs.
/// @param [in] node
/// @return GRAPH_SUCCESS if removed successfully; another status on failure.
///
static graphStatus RemoveSubgraphsOnNode(const NodePtr &node);

private:
static std::map<NodePtr, std::vector<uint32_t>> map_send_info_;
static std::map<NodePtr, std::vector<uint32_t>> map_recv_info_;


+ 2
- 0
inc/graph/utils/type_utils.h View File

@@ -24,6 +24,7 @@
#include "graph/ge_error_codes.h"
#include "graph/types.h"
#include "graph/usr_types.h"
#include "register/register_types.h"

namespace ge {
class TypeUtils {
@@ -37,6 +38,7 @@ class TypeUtils {
static std::string FormatToSerialString(Format format);
static Format SerialStringToFormat(const std::string &str);
static Format DataFormatToFormat(const std::string &str);
static Format DomiFormatToFormat(domi::domiTensorFormat_t domi_format);

static graphStatus Usr2DefQuantizeFactorParams(const UsrQuantizeFactorParams &usr, QuantizeFactorParams &def);
static graphStatus Def2UsrQuantizeFactorParams(const QuantizeFactorParams &def, UsrQuantizeFactorParams &usr);


+ 155
- 35
src/common/graph/compute_graph.cc View File

@@ -36,6 +36,75 @@
namespace ge {
namespace {
const size_t OUTPUT_PARAM_SIZE = 2;
bool IsUseBFS() {
string run_mode;
const int base = 10;
if (ge::GetContext().GetOption(ge::OPTION_GRAPH_RUN_MODE, run_mode) == GRAPH_SUCCESS && !run_mode.empty()) {
if (GraphRunMode(std::strtol(run_mode.c_str(), nullptr, base)) >= TRAIN) {
return true;
}
} else {
GELOGW("OPTION_GRAPH_RUN_MODE not set, use BFSTopologicalSorting by default.");
}
return false;
}
bool IsTailingOptimization() {
string is_tailing_optimization_option;
auto ret = GetContext().GetOption(ge::OPTION_EXEC_ENABLE_TAILING_OPTIMIZATION, is_tailing_optimization_option);
if (ret == GRAPH_SUCCESS) {
GELOGI("Option ge.exec.isTailingOptimization is %s", is_tailing_optimization_option.c_str());
// "1" means it's True from frontend option
return is_tailing_optimization_option == "1";
}
GELOGW("OPTION_EXEC_ENABLE_TAILING_OPTIMIZATION not set, use BFSTopologicalSorting by default.");
return false;
}
bool IsFusedNode(const NodePtr &node) {
bool is_fused_node = false;
AttrUtils::GetBool(node->GetOpDesc(), ATTR_NAME_HCCL_FUSED_FLAG, is_fused_node);
return is_fused_node;
}
string GetGroupId(const NodePtr &node) {
string group_id;
AttrUtils::GetStr(node->GetOpDesc(), ATTR_NAME_HCCL_FUSED_GROUP, group_id);
return group_id;
}
bool IsGroupEnd(const NodePtr &node) {
if (GetGroupId(node).empty()) {
return false;
}
if (node->GetOutDataNodesSize() == 0) {
return true;
}
for (const auto &out_data_node : node->GetOutDataNodes()) {
if (IsFusedNode(out_data_node)) {
return true;
}
}
return false;
}
void SplitNodeToStack(const std::map<string, NodePtr> &breadth_node_map, string current_group_id,
std::vector<NodePtr> &stack_input, std::deque<NodePtr> &group_stack, std::deque<NodePtr> &stack) {
for (const auto &name_node : breadth_node_map) {
// group first
string group_id;
if (AttrUtils::GetStr(name_node.second->GetOpDesc(), ATTR_NAME_HCCL_FUSED_GROUP, group_id)) {
GELOGI("current node %s, group id: %s , current group id %s", name_node.second->GetName().c_str(),
group_id.c_str(), current_group_id.c_str());
if (!current_group_id.empty() && group_id != current_group_id) {
GELOGI("node go to input_stack back: %s", name_node.second->GetName().c_str());
(void)stack_input.insert(stack_input.begin(), name_node.second);
} else {
current_group_id = group_id;
GELOGI("node go to group_stack: %s", name_node.second->GetName().c_str());
(void)group_stack.push_front(name_node.second);
}
continue;
}
GELOGI("node go to stack: %s ", name_node.second->GetName().c_str());
(void)stack.push_front(name_node.second);
}
}
} // namespace

GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY ComputeGraph::ComputeGraph(const std::string &name)
@@ -546,24 +615,21 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY void ComputeGraph::SetParentNode(
///
GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus
ComputeGraph::UpdateInputMapping(const std::map<uint32_t, uint32_t> &input_mapping) {
size_t update_num = 0;
for (auto &input : nodes_) {
if (update_num >= input_mapping.size()) {
break;
}
uint32_t cur_index = 0;
if (!ge::AttrUtils::GetInt(input->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, cur_index)) {
continue;
}
auto iter = input_mapping.find(cur_index);
if (iter == input_mapping.end()) {
continue;
}
if (!ge::AttrUtils::SetInt(input->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, iter->second)) {
GE_LOGE("UpdateInputMapping failed: set attr ATTR_NAME_PARENT_NODE_INDEX failed.");
return GRAPH_FAILED;
if (input->GetType() == DATA) {
uint32_t cur_index = 0;
if (!ge::AttrUtils::GetInt(input->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, cur_index)) {
continue;
}
auto iter = input_mapping.find(cur_index);
if (iter == input_mapping.end()) {
continue;
}
if (!ge::AttrUtils::SetInt(input->GetOpDesc(), ATTR_NAME_PARENT_NODE_INDEX, iter->second)) {
GE_LOGE("UpdateInputMapping failed: set attr ATTR_NAME_PARENT_NODE_INDEX failed.");
return GRAPH_FAILED;
}
}
update_num++;
}

return GRAPH_SUCCESS;
@@ -719,10 +785,10 @@ graphStatus ComputeGraph::BFSTopologicalSorting(std::vector<NodePtr> &node_vec,
node = stack_input.back();
stack_input.pop_back();
}

node_vec.push_back(node);
GE_CHECK_NOTNULL(node->GetOpDesc());
GELOGD("node_vec.push_back %s", node->GetOpDesc()->GetName().c_str());

CollectBreadthOutNode(node, map_in_edge_num, breadth_node_map);

for (const auto &name_node : breadth_node_map) {
@@ -730,7 +796,65 @@ graphStatus ComputeGraph::BFSTopologicalSorting(std::vector<NodePtr> &node_vec,
}
breadth_node_map.clear();
}
return GRAPH_SUCCESS;
}

graphStatus ComputeGraph::BFSTopologicalSortingWithGroup(std::vector<NodePtr> &node_vec,
std::map<NodePtr, uint32_t> &map_in_edge_num,
std::deque<NodePtr> &stack) {
GELOGI("Runing_Bfs_Sort_With_Group");
std::string current_group_id;
std::vector<NodePtr> stack_input;
std::deque<NodePtr> group_stack;
std::deque<NodePtr> fused_node_stack;
std::map<string, NodePtr> breadth_node_map;
// Record in-edge counts; nodes with no input edges are collected into stack_input
GE_CHK_BOOL_EXEC(SortNodes(stack_input, map_in_edge_num) == GRAPH_SUCCESS, return GRAPH_FAILED, "sort nodes failed");

// Only data nodes here
while (!stack_input.empty() || !stack.empty() || !group_stack.empty()) {
NodePtr node = nullptr;
if (!group_stack.empty()) {
// Traversal node in group has priority
node = group_stack.back();
group_stack.pop_back();
} else if (!stack.empty()) {
node = stack.back();
stack.pop_back();
} else {
node = stack_input.back();
stack_input.pop_back();
}

if (IsFusedNode(node) && current_group_id.empty()) {
current_group_id = node->GetName();
}
if (GetGroupId(node).empty() || GetGroupId(node) == current_group_id) {
node_vec.push_back(node);
GE_CHECK_NOTNULL(node->GetOpDesc());
GELOGI("node_vec.push_back %s", node->GetOpDesc()->GetName().c_str());
} else {
if (current_group_id.empty()) {
current_group_id = GetGroupId(node);
node_vec.push_back(node);
GE_CHECK_NOTNULL(node->GetOpDesc());
GELOGI("node_vec.push_back %s", node->GetOpDesc()->GetName().c_str());
} else {
GELOGI("current group id is %s ,node go to input_stack back: %s", current_group_id.c_str(),
node->GetName().c_str());
(void)stack_input.insert(stack_input.begin(), node);
continue;
}
}
CollectBreadthOutNode(node, map_in_edge_num, breadth_node_map);
SplitNodeToStack(breadth_node_map, current_group_id, stack_input, group_stack, stack);
breadth_node_map.clear();
// check the end of group
if (IsGroupEnd(node)) {
GELOGI("Current node %s is end of group %s.", node->GetName().c_str(), current_group_id.c_str());
current_group_id = "";
}
}
return GRAPH_SUCCESS;
}

@@ -751,15 +875,14 @@ graphStatus ComputeGraph::CollectBreadthOutNode(const NodePtr &node, std::map<No
}
}
}

GE_IF_BOOL_EXEC(
node->GetOutControlAnchor() != nullptr, for (AnchorPtr peer_in_anchor
: node->GetOutControlAnchor()->GetPeerAnchors()) {
if (node->GetOutControlAnchor() != nullptr) {
for (AnchorPtr peer_in_anchor : node->GetOutControlAnchor()->GetPeerAnchors()) {
auto iter = map_in_edge_num.find(peer_in_anchor->GetOwnerNode());
if (iter != map_in_edge_num.end() && 0 == --iter->second) {
(void)breadth_node_map.emplace(peer_in_anchor->GetOwnerNode()->GetName(), peer_in_anchor->GetOwnerNode());
}
})
}
}
return GRAPH_SUCCESS;
}

@@ -796,21 +919,18 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus ComputeGraph::Topolog
graphStatus ComputeGraph::TopologicalSortingGraph() {
std::vector<NodePtr> node_vec;
std::map<NodePtr, uint32_t> map_in_edge_num;
bool use_BFS = false;
string run_mode;
const int base = 10;
if (ge::GetContext().GetOption(ge::OPTION_GRAPH_RUN_MODE, run_mode) == GRAPH_SUCCESS && !run_mode.empty()) {
if (GraphRunMode(std::strtol(run_mode.c_str(), nullptr, base)) >= TRAIN) {
use_BFS = true;
}
} else {
GELOGW("OPTION_GRAPH_RUN_MODE not set, use BFSTopologicalSorting by default.");
}

bool use_BFS = IsUseBFS();
bool is_tailing_optimization = IsTailingOptimization();
if (use_BFS) {
std::deque<NodePtr> stack;
if (BFSTopologicalSorting(node_vec, map_in_edge_num, stack) != GRAPH_SUCCESS) {
return GRAPH_FAILED;
if (is_tailing_optimization) {
if (BFSTopologicalSortingWithGroup(node_vec, map_in_edge_num, stack) != GRAPH_SUCCESS) {
return GRAPH_FAILED;
}
} else {
if (BFSTopologicalSorting(node_vec, map_in_edge_num, stack) != GRAPH_SUCCESS) {
return GRAPH_FAILED;
}
}
} else {
std::vector<NodePtr> stack;
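
The scheduling policy inside BFSTopologicalSortingWithGroup is easier to see in isolation. Below is a hedged, self-contained sketch of just the group-priority rule (dependency counting is elided and all names are invented); it is not the GE implementation itself:

```cpp
// Minimal sketch of the group-priority ready-queue policy used by
// BFSTopologicalSortingWithGroup. In-edge tracking is elided; DemoNode and
// the input queues are simplified stand-ins.
#include <deque>
#include <string>
#include <vector>

struct DemoNode {
  std::string name;
  std::string group_id;  // empty if the node belongs to no fusion group
};

std::vector<DemoNode> DrainWithGroupPriority(std::deque<DemoNode> ready, std::deque<DemoNode> group_ready) {
  std::vector<DemoNode> order;
  std::deque<DemoNode> deferred;  // nodes of a *different* group, retried later
  std::string current_group;
  while (!ready.empty() || !group_ready.empty() || !deferred.empty()) {
    if (group_ready.empty() && ready.empty()) {
      current_group.clear();  // active group exhausted; admit deferred nodes
    }
    DemoNode node;
    if (!group_ready.empty()) {  // nodes of the active group run first
      node = group_ready.back();
      group_ready.pop_back();
    } else if (!ready.empty()) {
      node = ready.back();
      ready.pop_back();
    } else {
      node = deferred.back();
      deferred.pop_back();
    }
    if (!node.group_id.empty() && !current_group.empty() && node.group_id != current_group) {
      deferred.push_front(node);  // defer until the current group is finished
      continue;
    }
    if (!node.group_id.empty()) {
      current_group = node.group_id;  // open (or continue) this node's group
    }
    order.push_back(node);
  }
  return order;
}
```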


+ 6
- 0
src/common/graph/debug/ge_op_types.h View File

@@ -48,6 +48,12 @@ GE_REGISTER_OPTYPE(VARIABLEV2, "VariableV2");

GE_REGISTER_OPTYPE(INPUT_TYPE, "Input");

// Horovod operator
GE_REGISTER_OPTYPE(HVDCALLBACKALLREDUCE, "hvdCallbackAllreduce");
GE_REGISTER_OPTYPE(HVDCALLBACKALLGATHER, "hvdCallbackAllgather");
GE_REGISTER_OPTYPE(HVDCALLBACKBROADCAST, "hvdCallbackBroadcast");
GE_REGISTER_OPTYPE(HVDWAIT, "hvdWait");

GE_REGISTER_OPTYPE(NODE_NAME_NET_OUTPUT, "Node_Output");

GE_REGISTER_OPTYPE(RECV, "Recv");


+ 18
- 0
src/common/graph/ge_attr_define.cc View File

@@ -76,6 +76,10 @@ const std::string ATTR_NAME_ALGO = "algo";

const std::string ATTR_NAME_FORMAT = "format";

const std::string ATTR_NAME_STORAGE_FORMAT = "storage_format";

const std::string ATTR_NAME_STORAGE_SHAPE = "storage_shape";

const std::string ATTR_NAME_FILTER_FORMAT = "filter_format";

const std::string ATTR_NAME_LRN_K = "lrn_k";
@@ -115,6 +119,7 @@ const std::string ATTR_NAME_AIPP = "aipp";
const std::string NEW_AIPP_CONV_OP = "new_conv_op_for_aipp";

const std::string ATTR_NAME_SESSION_GRAPH_ID = "_session_graph_id";
const std::string ATTR_NAME_PARENT_GRAPH_NAME = "_parent_graph_name";

const std::string ATTR_NAME_MULTISHAPE_BATCHLIST = "multi_shape_batchlist";
const std::string ATTR_NAME_MULTISHAPE_BATCHLIST_SIZE = "multi_shape_batchlist_size";
@@ -697,6 +702,8 @@ const std::string ATTR_MODEL_MEMORY_SIZE = "memory_size";

const std::string ATTR_MODEL_ZERO_COPY_MEMORY_SIZE = "zero_copy_memory_size";

const std::string ATTR_MODEL_OUT_NODES_NAME = "attr_model_out_nodes_name";

const std::string ATTR_MODEL_WEIGHT_SIZE = "weight_size";

const std::string ATTR_MODEL_TASK_GEN_BASE_ADDR = "task_gen_base_addr";
@@ -895,6 +902,7 @@ const std::string ATTR_NAME_ACTIVE_STREAM_LIST = "active_stream_list";
const std::string ATTR_NAME_SWITCHN_PRED_VALUE = "switch_pred_value";
const std::string ATTR_NAME_ITERATORS_PER_LOOP = "iterations_per_loop";
const std::string ATTR_NAME_FLOW_CTRL_NODE_FLAG = "is_flow_ctrl_node";
const std::string ATTR_NAME_SUBGRAPH_FIRST_ACTIVE = "subgraph_first_active";

const std::string ATTR_NAME_SWITCH_BRANCH_NODE_LABEL = "_switch_branch_node_label";
const std::string ATTR_NAME_SWITCH_TRUE_BRANCH_FLAG = "_switch_true_branch_flag";
@@ -973,12 +981,15 @@ const std::string ATTR_NAME_DATA_DUMP_ORIGIN_FORMAT = "_datadump_origin_format";
const std::string ATTR_NAME_DATA_DUMP_ORIGIN_DATA_TYPE = "_datadump_origin_data_type";

// functional ops attr
const std::string ATTR_NAME_IF_THEN_BRANCH = "then_branch";
const std::string ATTR_NAME_IF_ELSE_BRANCH = "else_branch";
const std::string ATTR_NAME_WHILE_COND = "cond";
const std::string ATTR_NAME_WHILE_BODY = "body";

// used for label switch
const std::string ATTR_NAME_LABEL_SWITCH_INDEX = "_label_switch_index";
const std::string ATTR_NAME_LABEL_SWITCH_LIST = "_label_switch_list";
const std::string ATTR_NAME_SUBGRAPH_END_NODE = "_subgraph_end_node";

const std::string ATTR_NAME_INPUT_DATATYPE = "input_datatype";
const std::string ATTR_NAME_OUTPUT_DATATYPE = "output_datatype";
@@ -990,4 +1001,11 @@ const std::string ATTR_NAME_VALID_INPUT_SHAPE_LIST_LIST = "_valid_input_shape_li
const std::string ATTR_NAME_VALID_OUTPUT_SHAPE_LIST_LIST = "_valid_output_shape_list_list";
const std::string ATTR_NAME_SLICE_INPUT_OFFSET_LIST_LIST = "_input_offset_list_list";
const std::string ATTR_NAME_SLICE_OUTPUT_OFFSET_LIST_LIST = "_input_offset_list_list";

// used for Horovod
const std::string ATTR_INTER_EVENT_IDENTIFY = "event_id";
const std::string ATTR_HOROVOD_ATTR_REDUCE_TYPE = "reduce_op";
// used for allreduce tailing optimization
const std::string ATTR_NAME_HCCL_FUSED_GROUP = "_hccl_fused_group";
const std::string ATTR_NAME_HCCL_FUSED_FLAG = "_hccl_fused_node";
} // namespace ge

+ 22
- 1
src/common/graph/op_desc.cc View File

@@ -878,7 +878,7 @@ graphStatus OpDesc::CommonVerify() const {
// Checking shape of all inputs
vector<int64_t> ishape = GetInputDescPtr(iname)->GetShape().GetDims();
for (int64_t dim : ishape) {
GE_CHK_BOOL_RET_STATUS(dim >= -1, GRAPH_FAILED, "operator input %s shape contains negative or zero dimension.",
GE_CHK_BOOL_RET_STATUS(dim >= -2, GRAPH_FAILED, "operator input %s shape contains an invalid dimension (less than -2).",
iname.c_str());
}
}
@@ -1310,4 +1310,25 @@ OpDesc::GetSubgraphTypeByIrName(const std::string &name) const {
}
return iter->second;
}

GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus
OpDesc::GetSubgraphNameByInstanceName(const std::string &instance_name, std::string &subgraph_name) const {
for (size_t idx = 0; idx < subgraph_instance_names_.size(); ++idx) {
if (subgraph_instance_names_[idx] != instance_name) { // find subgraph index.
continue;
}

for (auto name_to_index : subgraph_names_to_index_) {
if (name_to_index.second != idx) { // find subgraph name.
continue;
}

subgraph_name = name_to_index.first;
return GRAPH_SUCCESS;
}
}

return GRAPH_PARAM_INVALID;
}

} // namespace ge
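
The reverse lookup above recovers the IR subgraph name from a concrete instance name. A hedged call-site sketch, where the op and the names are hypothetical:

```cpp
// Hedged sketch: recovering the IR subgraph name (e.g. "then_branch") from a
// concrete instance name. The OpDescPtr and both names are illustrative.
#include <string>
#include "graph/op_desc.h"

ge::graphStatus Demo(const ge::OpDescPtr &if_op) {
  std::string ir_name;
  // e.g. instance "graph_12_then" was registered for IR name "then_branch"
  if (if_op->GetSubgraphNameByInstanceName("graph_12_then", ir_name) != ge::GRAPH_SUCCESS) {
    return ge::GRAPH_PARAM_INVALID;  // instance name is not attached to this op
  }
  return ge::GRAPH_SUCCESS;
}
```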

+ 127
- 37
src/common/graph/operator.cc View File

@@ -30,9 +30,11 @@
#include "framework/common/debug/ge_log.h"
#include "graph/compute_graph.h"
#include "graph/ge_attr_value.h"
#include "graph/ge_context.h"
#include "graph/ge_tensor.h"
#include "graph/node.h"
#include "graph/op_desc.h"
#include "graph/runtime_inference_context.h"
#include "graph/usr_types.h"
#include "utils/graph_utils.h"
#include "utils/op_desc_utils.h"
@@ -349,48 +351,54 @@ class OperatorImpl : public std::enable_shared_from_this<OperatorImpl> {

InferenceContextPtr GetInferenceContext() const { return inference_context_; }

void SubgraphRegister(const std::string &name, bool dynamic) {
op_desc_->RegisterSubgraphIrName(name, dynamic ? kDynamic : kStatic);
void SubgraphRegister(const std::string &ir_name, bool dynamic) {
op_desc_->RegisterSubgraphIrName(ir_name, dynamic ? kDynamic : kStatic);
}

void SubgraphCountRegister(const std::string &name, uint32_t count) {
if (op_desc_->GetSubgraphTypeByIrName(name) == kStatic) {
op_desc_->AddSubgraphName(name);
void SubgraphCountRegister(const std::string &ir_name, uint32_t count) {
if (op_desc_->GetSubgraphTypeByIrName(ir_name) == kStatic) {
op_desc_->AddSubgraphName(ir_name);
subgraph_names_to_builders_[ir_name] = nullptr;
} else {
for (uint32_t i = 0; i < count; ++i) {
op_desc_->AddSubgraphName(name + std::to_string(i));
string key_name = ir_name + std::to_string(i);
op_desc_->AddSubgraphName(key_name);
subgraph_names_to_builders_[key_name] = nullptr;
}
}

subgraph_names_to_builders_[name].resize(count, nullptr);
}

void SetSubgraphBuilder(const std::string &name, uint32_t index, const SubgraphBuilder &builder) {
auto iter = subgraph_names_to_builders_.find(name);
if (iter == subgraph_names_to_builders_.end()) {
GELOGE(PARAM_INVALID, "Failed to set subgraph builder for name %s index %u, invalid name", name.c_str(), index);
return;
void SetSubgraphBuilder(const std::string &ir_name, uint32_t index, const SubgraphBuilder &builder) {
string key_name = ir_name;
if (op_desc_->GetSubgraphTypeByIrName(ir_name) == kDynamic) {
key_name += std::to_string(index);
}
if (iter->second.size() <= index) {
GELOGE(PARAM_INVALID, "Failed to set subgraph builder for name %s index %u, excceds the max size %zu",
name.c_str(), index, iter->second.size());

auto it = subgraph_names_to_builders_.find(key_name);
if (it == subgraph_names_to_builders_.end()) {
GELOGE(PARAM_INVALID, "Failed to set subgraph builder for name %s index %u.", ir_name.c_str(), index);
return;
}
iter->second[index] = builder;
it->second = builder;
}

SubgraphBuilder GetSubgraphBuilder(const std::string &name, uint32_t index) const {
SubgraphBuilder GetSubgraphBuilder(const std::string &ir_name, uint32_t index) const {
string key_name = ir_name;
if (op_desc_->GetSubgraphTypeByIrName(ir_name) == kDynamic) {
key_name += std::to_string(index);
}

return GetSubgraphBuilder(key_name);
}

SubgraphBuilder GetSubgraphBuilder(const std::string &name) const {
auto iter = subgraph_names_to_builders_.find(name);
if (iter == subgraph_names_to_builders_.end()) {
GELOGE(PARAM_INVALID, "Failed to get subgraph builder for name %s index %u, invalid name", name.c_str(), index);
return nullptr;
}
if (iter->second.size() <= index) {
GELOGE(PARAM_INVALID, "Failed to get subgraph builder for name %s index %u, excceds the max size %zu",
name.c_str(), index, iter->second.size());
GELOGE(PARAM_INVALID, "Failed to get subgraph builder for name %s", name.c_str());
return nullptr;
}
return iter->second[index];

return iter->second;
}

std::vector<std::string> GetSubgraphNames() const {
@@ -408,12 +416,11 @@ class OperatorImpl : public std::enable_shared_from_this<OperatorImpl> {
private:
ge::ConstNodePtr node_{nullptr};
ge::InferenceContextPtr inference_context_;
GraphBuilderCallback graph_builder_callback_;
std::map<string, std::vector<OpIO>> output_links_{};
std::map<string, OpIO> input_link_{};
std::vector<std::weak_ptr<OperatorImpl>> control_input_link_{};
std::vector<std::weak_ptr<OperatorImpl>> control_output_link_{};
std::map<std::string, std::vector<SubgraphBuilder>> subgraph_names_to_builders_;
std::map<std::string, SubgraphBuilder> subgraph_names_to_builders_;
};

// Used to manage OperatorImpl instances created by ge api.
@@ -582,6 +589,17 @@ graphStatus Operator::GetInputConstData(const string &dst_name, Tensor &data) co
return const_op.GetAttr(op::Const::name_attr_value(), data);
}
}

// Try get from runtime inference context
auto session_id = std::to_string(GetContext().SessionId());
RuntimeInferenceContext *runtime_infer_ctx = nullptr;
if (RuntimeInferenceContext::GetContext(session_id, &runtime_infer_ctx) == GRAPH_SUCCESS) {
GELOGD("To get constant from runtime inference context. session_id = %s", session_id.c_str());
auto ret = runtime_infer_ctx->GetTensor(peer_node_ptr->GetOpDesc()->GetId(), out_data_anchor->GetIdx(), data);
if (ret == GRAPH_SUCCESS) {
return GRAPH_SUCCESS;
}
}
} else {
// For outer graph
return GetInputConstDataOut(dst_name, data);
@@ -1204,25 +1222,27 @@ void Operator::SubgraphCountRegister(const std::string &name, uint32_t count) {
operator_impl_->SubgraphCountRegister(name, count);
}

void Operator::SetSubgraphBuilder(const std::string &name, uint32_t index, const SubgraphBuilder &builder) {
void Operator::SetSubgraphBuilder(const std::string &ir_name, uint32_t index, const SubgraphBuilder &builder) {
if (operator_impl_ == nullptr) {
GELOGE(GRAPH_FAILED, "operator impl is nullptr, name %s.", name.c_str());
GELOGE(GRAPH_FAILED, "operator impl is nullptr, name %s.", ir_name.c_str());
return;
}
operator_impl_->SetSubgraphBuilder(name, index, builder);
operator_impl_->SetSubgraphBuilder(ir_name, index, builder);
}

std::vector<std::string> Operator::GetSubgraphNames() const { return operator_impl_->GetSubgraphNames(); }

SubgraphBuilder Operator::GetDynamicSubgraphBuilder(const string &name, uint32_t index) const {
SubgraphBuilder Operator::GetDynamicSubgraphBuilder(const string &ir_name, uint32_t index) const {
if (operator_impl_ == nullptr) {
GELOGE(GRAPH_FAILED, "operator impl is nullptr.");
return nullptr;
}
return operator_impl_->GetSubgraphBuilder(name, index);
return operator_impl_->GetSubgraphBuilder(ir_name, index);
}

SubgraphBuilder Operator::GetSubgraphBuilder(const string &name) const { return GetDynamicSubgraphBuilder(name, 0); }
SubgraphBuilder Operator::GetSubgraphBuilder(const string &ir_name) const {
return GetDynamicSubgraphBuilder(ir_name, 0);
}

Graph Operator::GetSubgraph(const string &name) const {
if (operator_impl_ == nullptr) {
@@ -1307,8 +1327,8 @@ class GraphBuilderImpl {
}
}
GE_CHK_BOOL_EXEC(!vec_inputs.empty(), return nullptr,
"User Input do not include operator such as \
Data, Variable operator or operator that has output but no input.");
"User Input do not include operator such as "
"Data, Variable operator or operator that has output but no input.");
auto ret = WalkAllOperators(vec_inputs);
GE_CHK_BOOL_EXEC(ret == GRAPH_SUCCESS, return nullptr, "WalkAllOperators failed.");

@@ -1361,8 +1381,67 @@ class GraphBuilderImpl {
vec_op_back_forward.push_back(in_link.lock());
}
que.push(vec_op_back_forward);

if (WalkAllSubgraphs(node_ptr, op_impl) != GRAPH_SUCCESS) {
return GRAPH_FAILED;
}
}
}
return MoveSubgraphToRoot(graph_);
}

graphStatus WalkAllSubgraphs(const NodePtr &node, const OperatorImplPtr &op_impl) {
const string name = node->GetName();
for (auto &name_idx : op_impl->op_desc_->GetSubgraphNameIndexes()) {
const SubgraphBuilder &builder = op_impl->GetSubgraphBuilder(name_idx.first);
GE_CHK_BOOL_EXEC(builder != nullptr, return GRAPH_FAILED, "Node: %s, Get builder failed.", name.c_str());

Graph graph = builder(); // Build subgraph from the user-defined builder.
const ComputeGraphPtr &subgraph = GraphUtils::GetComputeGraph(graph);
GE_CHK_BOOL_EXEC(subgraph != nullptr, return GRAPH_FAILED, "Node: %s, Build graph failed.", name.c_str());

subgraph->SetParentNode(node);
subgraph->SetParentGraph(graph_);
if (graph_->AddSubgraph(subgraph->GetName(), subgraph) != GRAPH_SUCCESS) {
return GRAPH_FAILED;
}

if (op_impl->op_desc_->SetSubgraphInstanceName(name_idx.second, subgraph->GetName()) != GRAPH_SUCCESS) {
GELOGE(GRAPH_FAILED, "Failed to set subgraph %s index %u", subgraph->GetName().c_str(), name_idx.second);
return GRAPH_FAILED;
}
}

return GRAPH_SUCCESS;
}

graphStatus MoveSubgraphToRoot(const ComputeGraphPtr &graph) {
const ComputeGraphPtr &root_graph = GraphUtils::FindRootGraph(graph);
if (root_graph == nullptr) {
GELOGE(GRAPH_FAILED, "Graph: %s, Find root graph failed.", graph->GetName().c_str());
return GRAPH_FAILED;
}

if (root_graph == graph) {
auto subgraphs = graph->GetAllSubgraphs();
for (auto &subgraph : subgraphs) {
if (MoveSubgraphToRoot(subgraph) != GRAPH_SUCCESS) {
return GRAPH_FAILED;
}
}
} else {
auto subgraphs = graph->GetAllSubgraphs();
for (auto &subgraph : subgraphs) {
if (root_graph->AddSubgraph(subgraph->GetName(), subgraph) != GRAPH_SUCCESS) {
return GRAPH_FAILED;
}
graph->RemoveSubgraph(subgraph->GetName());
if (MoveSubgraphToRoot(subgraph) != GRAPH_SUCCESS) {
return GRAPH_FAILED;
}
}
}

return GRAPH_SUCCESS;
}

@@ -1423,11 +1502,22 @@ class GraphBuilderImpl {
};

inline bool HasSameNameNode(const ComputeGraphPtr &compute_graph) {
for (const auto &graph : compute_graph->GetAllSubgraphs()) {
std::set<string> node_names;
for (auto const &node : graph->GetDirectNode()) {
node_names.insert(node->GetName());
}

if (node_names.size() != graph->GetDirectNodesSize()) {
return true;
}
}

std::set<string> node_names;
for (auto const &node : compute_graph->GetAllNodes()) {
for (auto const &node : compute_graph->GetDirectNode()) {
node_names.insert(node->GetName());
}
return node_names.size() != compute_graph->GetAllNodes().size();
return node_names.size() != compute_graph->GetDirectNodesSize();
}

ComputeGraphPtr GraphUtils::CreateGraphFromOperator(const string &name, const vector<ge::Operator> &inputs) {
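
The builder-key construction this change introduces reduces to a single rule; a small sketch, with the names invented for illustration:

```cpp
// Hedged sketch of the builder-key rule above: a static subgraph is keyed by
// its IR name, a dynamic one by IR name + instance index.
#include <cstdint>
#include <string>

std::string SubgraphBuilderKey(const std::string &ir_name, bool dynamic, uint32_t index) {
  return dynamic ? ir_name + std::to_string(index)  // e.g. "branches0", "branches1"
                 : ir_name;                         // e.g. "then_branch"
}
```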


+ 9
- 19
src/common/graph/ref_relation.cc View File

@@ -136,17 +136,11 @@ graphStatus RefRelations::Impl::BuildRefRelationsForBranch(
out_ref_i_all_refs.emplace_back(cell_root);
for (const auto &ele : ref_o_net_nodes) {
RefCell cell_netoutput_in;
RefCell cell_netoutput_out;
cell_netoutput_in.node_name = (ele.first)->GetName();
cell_netoutput_in.node = ele.first;
cell_netoutput_in.in_out = NODE_IN;
cell_netoutput_in.in_out_idx = ele.second;
cell_netoutput_out.node_name = (ele.first)->GetName();
cell_netoutput_out.node = ele.first;
cell_netoutput_out.in_out = NODE_OUT;
cell_netoutput_out.in_out_idx = ele.second;
out_ref_i_all_refs.emplace_back(cell_netoutput_in);
out_ref_i_all_refs.emplace_back(cell_netoutput_out);
}
node_refs.emplace_back(out_ref_i_all_refs);
ref_o++;
@@ -155,6 +149,7 @@ graphStatus RefRelations::Impl::BuildRefRelationsForBranch(
}

graphStatus RefRelations::Impl::BuildLookUpTables() {
GELOGD("start to build look up table!");
for (size_t i = 0; i < values_.size(); i++) {
vector<vector<RefCell>> &val = values_[i];
for (const auto &ele : val) {
@@ -216,12 +211,7 @@ graphStatus RefRelations::Impl::BuildRefRelationsForWhile(
cell_netoutput_in.node = ele.first;
cell_netoutput_in.in_out = NODE_IN;
cell_netoutput_in.in_out_idx = ele.second;
cell_netoutput_out.node_name = (ele.first)->GetName();
cell_netoutput_out.node = ele.first;
cell_netoutput_out.in_out = NODE_OUT;
cell_netoutput_out.in_out_idx = ele.second;
ref_i_all_refs.emplace_back(cell_netoutput_in);
ref_i_all_refs.emplace_back(cell_netoutput_out);
}
node_refs.emplace_back(ref_i_all_refs);
ref_i++;
@@ -237,13 +227,10 @@ graphStatus RefRelations::Impl::BuildRelationsWithFuncNodeType(
auto node_type = root_node->GetType();

auto status = GRAPH_SUCCESS;
if (node_type == kIf || node_type == kCase) {
if (node_type != kWhile) {
status = BuildRefRelationsForBranch(root_node, classed_data_nodes, classed_netoutput_nodes, node_refs);
} else if (node_type == kWhile) {
status = BuildRefRelationsForWhile(root_node, classed_data_nodes, classed_netoutput_nodes, node_refs);
} else {
GELOGE(GRAPH_PARAM_INVALID, "Node type [%s] is not supported for build ref relations!", node_type.c_str());
status = GRAPH_PARAM_INVALID;
status = BuildRefRelationsForWhile(root_node, classed_data_nodes, classed_netoutput_nodes, node_refs);
}
return status;
}
@@ -291,6 +278,7 @@ graphStatus RefRelations::Impl::GetRootGraph(ge::ComputeGraph &graph, ge::Comput

graphStatus RefRelations::Impl::ProcessSubgraphDataNodes(vector<NodePtr> &data_nodes,
vector<vector<NodePtr>> &classed_data_nodes) {
GELOGD("start to process subgraph data nodes!");
int max_ref_idx = 0;
for (const auto &e : data_nodes) {
int i;
@@ -315,6 +303,7 @@ graphStatus RefRelations::Impl::ProcessSubgraphDataNodes(vector<NodePtr> &data_n

graphStatus RefRelations::Impl::ProcessSubgraphNetoutput(
const vector<NodePtr> &netoutput_nodes, vector<vector<std::pair<NodePtr, size_t>>> &classed_netoutput_nodes) {
GELOGD("[RefRelations]Start to process subgraph netoutput!");
for (const auto &sub_netoutput_node : netoutput_nodes) {
auto op_desc = sub_netoutput_node->GetOpDesc();
GE_CHECK_NOTNULL(op_desc);
@@ -340,6 +329,7 @@ graphStatus RefRelations::Impl::ProcessSubgraphNetoutput(
}

graphStatus RefRelations::Impl::BuildRefRelations(ge::ComputeGraph &graph) {
GELOGD("Start to build ref relations!");
/* First Step: Get root graph */
ge::ComputeGraph &root_graph = graph;
auto status = GetRootGraph(graph, root_graph);
@@ -349,12 +339,12 @@ graphStatus RefRelations::Impl::BuildRefRelations(ge::ComputeGraph &graph) {

for (const auto &node : graph.GetAllNodes()) {
auto node_type = node->GetType();
if (function_op.find(node_type) == function_op.end()) {
continue;
}
std::vector<NodePtr> ref_nodes;
auto op_desc = node->GetOpDesc();
auto sub_graph_names = op_desc->GetSubgraphInstanceNames();
if (sub_graph_names.empty()) {
continue;
}
vector<NodePtr> data_nodes;
vector<NodePtr> netoutput_nodes;
// Get data and netoutput of sub_graph


+ 95
- 0
src/common/graph/runtime_inference_context.cc View File

@@ -0,0 +1,95 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "graph/runtime_inference_context.h"
#include <cstdint>
#include "framework/common/debug/ge_log.h"

namespace ge {
std::map<std::string, std::unique_ptr<RuntimeInferenceContext>> RuntimeInferenceContext::contexts_;
std::mutex RuntimeInferenceContext::ctx_mu_;

graphStatus RuntimeInferenceContext::CreateContext(const std::string &context_id) {
GELOGI("To create context. session id = %s", context_id.c_str());
auto ctx = std::unique_ptr<RuntimeInferenceContext>(new (std::nothrow) RuntimeInferenceContext());
if (ctx == nullptr) {
GELOGE(GRAPH_FAILED, "Failed to create instance of RuntimeInferenceContext. context_id = %s", context_id.c_str());
return GRAPH_FAILED;
}

// Guard the shared context map, consistent with GetContext/DestroyContext.
std::lock_guard<std::mutex> lk(ctx_mu_);
auto emplace_ret = contexts_.emplace(context_id, std::move(ctx));
if (!emplace_ret.second) {
GELOGE(GRAPH_FAILED, "Old context not destroyed");
return GRAPH_FAILED;
}

return GRAPH_SUCCESS;
}

void RuntimeInferenceContext::DestroyContext(const std::string &context_id) {
GELOGI("To destroy context. session id = %s", context_id.c_str());
std::lock_guard<std::mutex> lk(ctx_mu_);
contexts_.erase(context_id);
}

graphStatus RuntimeInferenceContext::GetContext(const std::string &context_id, RuntimeInferenceContext **ctx) {
std::lock_guard<std::mutex> lk(ctx_mu_);
auto it = contexts_.find(context_id);
if (it != contexts_.end()) {
*ctx = it->second.get();
return GRAPH_SUCCESS;
}

GELOGD("Runtime inference context not created. session id = %s", context_id.c_str());
return GRAPH_FAILED;
}

graphStatus RuntimeInferenceContext::SetTensor(int64_t node_id, int output_id, Tensor &&tensor) {
std::lock_guard<std::mutex> lk(mu_);
auto &output_tensors = tensors_[node_id];
if (static_cast<uint32_t>(output_id) >= output_tensors.size()) {
output_tensors.resize(output_id + 1);
}

GELOGD("Set tensor for node_id = %ld, output_id = %d", node_id, output_id);
output_tensors[output_id] = std::move(tensor);
return GRAPH_SUCCESS;
}

graphStatus RuntimeInferenceContext::GetTensor(int64_t node_id, int output_id, Tensor &tensor) {
if (output_id < 0) {
GELOGE(GRAPH_PARAM_INVALID, "Invalid output index: %d", output_id);
return GRAPH_PARAM_INVALID;
}

std::lock_guard<std::mutex> lk(mu_);
auto iter = tensors_.find(node_id);
if (iter == tensors_.end()) {
GELOGE(INTERNAL_ERROR, "Node not register. Id = %ld", node_id);
return INTERNAL_ERROR;
}

auto &output_tensors = iter->second;
if (static_cast<uint32_t>(output_id) >= output_tensors.size()) {
GELOGE(GRAPH_FAILED, "Node output is not registered. node_id = %ld, output index = %d", node_id, output_id);
return GRAPH_FAILED;
}

GELOGD("Get tensor for node_id = %ld, output_id = %d", node_id, output_id);
tensor = output_tensors[output_id];
return GRAPH_SUCCESS;
}
} // namespace ge

+ 6
- 0
src/common/graph/utils/ge_ir_utils.cc View File

@@ -273,6 +273,9 @@ void OnnxUtils::AddAttrProtoForOpInAndOutDesc(onnx::NodeProto *node_proto, const
auto data_type = TypeUtils::DataTypeToSerialString(input_desc->GetDataType());
AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_STRING, "input_desc_dtype:" + std::to_string(i),
&data_type);
auto data_type_origin = TypeUtils::DataTypeToSerialString(input_desc->GetOriginDataType());
AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_STRING,
"input_desc_origin_dtype:" + std::to_string(i), &data_type_origin);
auto dims = input_desc->GetShape().GetDims();
AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INTS, "input_desc_shape:" + std::to_string(i),
&dims);
@@ -346,6 +349,9 @@ void OnnxUtils::AddAttrProtoForOpInAndOutDesc(onnx::NodeProto *node_proto, const
auto data_type = TypeUtils::DataTypeToSerialString(output_desc->GetDataType());
AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_STRING, "output_desc_dtype:" + std::to_string(i),
&data_type);
auto origin_data_type = TypeUtils::DataTypeToSerialString(output_desc->GetOriginDataType());
AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_STRING,
"output_desc_origin_dtype:" + std::to_string(i), &origin_data_type);
auto dims = output_desc->GetShape().GetDims();
AddAttrProto(node_proto, onnx::AttributeProto_AttributeType_INTS, "output_desc_shape:" + std::to_string(i),
&dims);


+ 167
- 11
src/common/graph/utils/graph_utils.cc View File

@@ -61,6 +61,7 @@ const char *const kDumpGraphLevel = "DUMP_GRAPH_LEVEL";
const char *const kDumpStrBuild = "Build";
const char *const kDumpStrPartition = "partition";
const char *const kDumpStrOptimizeSubgraph = "OptimizeSubGraph";
const char *const kDumpStrSubgraphFunc = "sub_graph";
const char *const kDumpStrAicpu = "Aicpu";
}; // namespace

@@ -203,6 +204,58 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus GraphUtils::InsertNod
}

GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus
GraphUtils::RemoveSubgraphRecursively(const ComputeGraphPtr &compute_graph, const NodePtr &remove_node) {
GE_CHECK_NOTNULL(compute_graph);
if (remove_node == nullptr) {
GELOGE(GRAPH_FAILED, "The node ptr should not be null.");
return GRAPH_FAILED;
}

// Check whether this node belongs to this compute graph (may be a little slow)
const auto &all_nodes_in_graph = compute_graph->GetDirectNode();
if (std::find(all_nodes_in_graph.begin(), all_nodes_in_graph.end(), remove_node) == all_nodes_in_graph.end()) {
GELOGE(GRAPH_FAILED, "Can not find node %s in graph %s.", remove_node->GetName().c_str(),
compute_graph->GetName().c_str());
return GRAPH_FAILED;
}
// Find all subgraphs of this node
const auto &root_graph = GraphUtils::FindRootGraph(compute_graph);
std::vector<ComputeGraphPtr> subgraphs;
std::vector<NodePtr> all_nodes;
std::deque<NodePtr> candidates;
candidates.emplace_back(remove_node);
while (!candidates.empty()) {
const NodePtr node = candidates.front();
all_nodes.emplace_back(node);
candidates.pop_front();

OpDescPtr op_desc = node->GetOpDesc();
if (op_desc == nullptr) {
continue;
}

const auto &subgraph_names = op_desc->GetSubgraphInstanceNames();
for (auto name_iter = subgraph_names.rbegin(); name_iter != subgraph_names.rend(); ++name_iter) {
auto subgraph = root_graph->GetSubgraph(*name_iter);
if (subgraph != nullptr) {
subgraphs.emplace_back(subgraph);
candidates.insert(candidates.begin(), subgraph->nodes_.begin(), subgraph->nodes_.end());
}
}
}
// Remove all collected subgraphs
for (const auto &remove_graph : subgraphs) {
if (root_graph->RemoveSubGraph(remove_graph) != GRAPH_SUCCESS) {
GELOGE(GRAPH_FAILED, "Remove subgraph failed, subgraph name is %s, compute graph is %s.",
remove_graph->GetName().c_str(), compute_graph->GetName().c_str());
return GRAPH_FAILED;
}
}
return GRAPH_SUCCESS;
}

GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus
GraphUtils::RemoveNodeWithoutRelink(const ComputeGraphPtr &compute_graph, const NodePtr &node) {
GE_CHECK_NOTNULL(compute_graph);
if (node == nullptr) {
@@ -217,12 +270,10 @@ GraphUtils::RemoveNodeWithoutRelink(const ComputeGraphPtr &compute_graph, const
(void)compute_graph->RemoveOutputNode(node);

// If the node has sub-graphs, delete them
auto sub_graph_names = node->GetOpDesc()->GetSubgraphInstanceNames();
if (!sub_graph_names.empty()) {
auto root_graph = FindRootGraph(compute_graph);
for (const auto &name : sub_graph_names) {
root_graph->RemoveSubgraph(name);
}
auto ret = RemoveSubgraphRecursively(compute_graph, node);
if (ret != GRAPH_SUCCESS) {
GELOGE(GRAPH_FAILED, "Remove subgraph recursively failed.");
return GRAPH_FAILED;
}

auto iter = find(compute_graph->nodes_.begin(), compute_graph->nodes_.end(), node);
@@ -484,9 +535,10 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY bool GraphUtils::MatchDumpStr(con
return false;
}

if (dump_graph_level == kDumpLevel2 && ((suffix.find(kDumpStrPartition) != std::string::npos) ||
(suffix.find(kDumpStrOptimizeSubgraph) != std::string::npos) ||
(suffix.find(kDumpStrAicpu) != std::string::npos))) {
if (dump_graph_level == kDumpLevel2 &&
((suffix.find(kDumpStrPartition) != std::string::npos) ||
(suffix.find(kDumpStrOptimizeSubgraph) != std::string::npos) ||
(suffix.find(kDumpStrAicpu) != std::string::npos) || (suffix.find(kDumpStrSubgraphFunc) != std::string::npos))) {
return true;
}

@@ -1026,9 +1078,9 @@ graphStatus ReplaceControlAnchors(const NodePtr &new_node, const NodePtr &old_no
GE_CHECK_NOTNULL(old_out_control_anchor);
auto peer_in_anchors = old_out_control_anchor->GetPeerAnchors();
auto new_out_control_anchor = new_node->GetOutControlAnchor();
GE_CHECK_NOTNULL(new_out_control_anchor);
auto exists_in_anchors = new_out_control_anchor->GetPeerAnchors();
auto exists_in_anchors_set = std::set<AnchorPtr>(exists_in_anchors.begin(), exists_in_anchors.end());
GE_CHECK_NOTNULL(new_out_control_anchor);
for (const auto &peer_in_anchor : peer_in_anchors) {
if (peer_in_anchor != nullptr) {
if (exists_in_anchors_set.count(peer_in_anchor) > 0) {
@@ -1304,6 +1356,26 @@ graphStatus GraphUtils::GetRefMapping(const ComputeGraphPtr &graph,
return GRAPH_SUCCESS;
}

GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY NodePtr GraphUtils::FindNodeFromAllNodes(ComputeGraphPtr &graph,
const std::string &name) {
auto root_graph = FindRootGraph(graph);
if (root_graph == nullptr) {
GE_LOGE("Failed find node %s, null root graph", name.c_str());
return nullptr;
}

for (const auto &node : root_graph->GetAllNodes()) {
if (node == nullptr) {
continue;
}
if (node->GetName() == name) {
return node;
}
}

return nullptr;
}

///
/// Get reference-mapping for in_data_anchors of node
/// @param [in] node
@@ -1668,7 +1740,7 @@ bool GraphUtils::IsRefFromInput(const OutDataAnchorPtr &out_data_anchor, int32_t
for (const auto &input_name : op_desc->GetAllInputNames()) {
if (!input_name.empty() && (output_name == input_name)) {
reuse_in_index = op_desc->GetInputIndexByName(input_name);
GELOGI("Reference name[%s] output[%s][%u] ref to input[%s][%d].", op_desc->GetName().c_str(),
GELOGI("Reference name[%s] output[%s][%d] ref to input[%s][%d].", op_desc->GetName().c_str(),
output_name.c_str(), output_index, input_name.c_str(), reuse_in_index);
return true;
}
@@ -1694,6 +1766,43 @@ bool GraphUtils::IsRefFromInput(const OutDataAnchorPtr &out_data_anchor, int32_t
}

///
/// Determine whether the graph is an UNKNOWN_SHAPE graph, based on whether the graph or any of
/// its subgraphs contains an UNKNOWN_SHAPE operator.
/// Note: This function only looks 'down' from the graph, not 'up'. For example, in the following
/// scenario (K for known shape, U for unknown shape), ROOT graph is UNKNOWN_SHAPE while SUB graph is KNOWN_SHAPE
/// ROOT graph: A -----> B -----> C
/// K subgraph U
/// |
/// V
/// SUB graph: D --> E --> F
/// K K K
/// @param [in] graph
/// @return bool
///
bool GraphUtils::IsUnknownShapeGraph(const ComputeGraphPtr &graph) {
if (graph == nullptr) {
GELOGW("Input graph is nullptr.");
return false;
}
for (const auto &node : graph->GetDirectNode()) {
bool is_unknown = false;
auto ret = NodeUtils::GetNodeUnknownShapeStatus(*node, is_unknown);
if (ret != GRAPH_SUCCESS) {
GELOGW("Get node unknown status failed, node name:%s, type:%s.", node->GetName().c_str(),
node->GetType().c_str());
continue;
}
if (is_unknown) {
GELOGD("Node %s, type %s is unknown shape in graph %s.", node->GetName().c_str(), node->GetType().c_str(),
graph->GetName().c_str());
return true;
}
}
GELOGD("Graph %s does not have unknown shape node.", graph->GetName().c_str());
return false;
}

///
/// @brief Add node to graph
/// @param [in] op_desc
/// @return ComputeGraphBuilder
@@ -1868,6 +1977,17 @@ NodePtr ComputeGraphBuilder::GetNode(const std::string &name) {
return iter->second;
}

///
/// @brief Get all nodes
/// @return std::vector<NodePtr>
///
std::vector<NodePtr> ComputeGraphBuilder::GetAllNodes() {
std::vector<NodePtr> nodes;
for (const auto &iter : node_names_) {
nodes.emplace_back(iter.second);
}
return nodes;
}

///
/// @brief Add node to graph
/// @param [in] op_desc
@@ -1938,6 +2058,16 @@ CompleteGraphBuilder &CompleteGraphBuilder::AddOutput(const std::string &owner_n
}

///
/// @brief Add target for graph
/// @param [in] target_name
/// @return CompleteGraphBuilder
///
CompleteGraphBuilder &CompleteGraphBuilder::AddTarget(const std::string &target_name) {
graph_targets_.emplace_back(target_name);
return *this;
}

///
/// @brief Set parent-node of graph
/// @param [in] parent_node
/// @return CompleteGraphBuilder
@@ -2013,6 +2143,11 @@ ComputeGraphPtr CompleteGraphBuilder::Build(graphStatus &error_code, std::string
return nullptr;
}

BuildGraphTargets(error_code, error_msg);
if (error_code != GRAPH_SUCCESS) {
return nullptr;
}

// ATTR_NAME_SESSION_GRAPH_ID
std::string graph_id;
if (!AttrUtils::GetStr(parent_node_->GetOwnerComputeGraph(), ATTR_NAME_SESSION_GRAPH_ID, graph_id)) {
@@ -2211,6 +2346,27 @@ void CompleteGraphBuilder::AddRetValNodes(graphStatus &error_code, std::string &
}

///
/// @brief Build target-nodes for graph
/// @param [out] error_code
/// @param [out] error_msg
/// @return void
///
void CompleteGraphBuilder::BuildGraphTargets(graphStatus &error_code, std::string &error_msg) {
std::vector<NodePtr> target_nodes;
for (const std::string &target_name : graph_targets_) {
auto target_iter = node_names_.find(target_name);
if ((target_iter == node_names_.end()) || (target_iter->second == nullptr)) {
error_code = GRAPH_FAILED;
error_msg = "BuildGraphTargets failed: target_node " + target_name + " not exist in graph.";
return;
}
target_nodes.emplace_back(target_iter->second);
}
owner_graph_->SetGraphTargetNodesInfo(target_nodes);
return;
}

///
/// @brief Add node to graph
/// @param [in] op_desc
/// @return PartialGraphBuilder


+ 80
- 0
src/common/graph/utils/node_utils.cc View File

@@ -29,6 +29,13 @@ namespace ge {
std::map<NodePtr, std::vector<uint32_t>> NodeUtils::map_send_info_{};
std::map<NodePtr, std::vector<uint32_t>> NodeUtils::map_recv_info_{};

const std::set<std::string> kConstOpTypes = {"Const", "Constant"};

const std::set<std::string> kIfOpTypes = {"If", "_If", "StatelessIf"};
const std::set<std::string> kWhileOpTypes = {"While", "_While", "StatelessWhile"};
const std::set<std::string> kCaseOpTypes = {"Case"};
const std::set<std::string> kForOpTypes = {"For"};

bool OpShapeIsUnknown(const OpDescPtr &desc) {
for (const auto &ptr : desc->GetAllInputsDescPtr()) {
auto ge_shape = ptr->GetShape();
@@ -315,6 +322,9 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY graphStatus NodeUtils::UpdatePeer
peer_input_desc->SetOriginShape(output_tensor.GetOriginShape());
peer_input_desc->SetDataType(output_tensor.GetDataType());
peer_input_desc->SetOriginDataType(output_tensor.GetOriginDataType());
std::vector<std::pair<int64_t, int64_t>> shape_range;
(void)output_tensor.GetShapeRange(shape_range);
peer_input_desc->SetShapeRange(shape_range);
ge::TensorUtils::SetRealDimCnt(*peer_input_desc,
static_cast<uint32_t>(output_tensor.GetShape().GetDims().size()));
GELOGI("Peer input opdesc name is %s, shape size is %zu, datatype is %d, original datatype is %d",
@@ -477,6 +487,14 @@ bool NodeUtils::IsSubgraphInput(const NodePtr &node) {
return false;
}

auto parent_op_desc = node->GetOwnerComputeGraph()->GetParentNode()->GetOpDesc();
if (parent_op_desc == nullptr) {
return false;
}
if (AttrUtils::HasAttr(parent_op_desc, ATTR_NAME_IS_UNKNOWN_SHAPE)) {
return false;
}

return node->GetOpDesc()->HasAttr(ATTR_NAME_PARENT_NODE_INDEX);
}

@@ -491,6 +509,14 @@ bool NodeUtils::IsSubgraphOutput(const NodePtr &node) {
return false;
}

auto parent_op_desc = node->GetOwnerComputeGraph()->GetParentNode()->GetOpDesc();
if (parent_op_desc == nullptr) {
return false;
}
if (AttrUtils::HasAttr(parent_op_desc, ATTR_NAME_IS_UNKNOWN_SHAPE)) {
return false;
}

for (GeTensorDesc &tensor : node->GetOpDesc()->GetAllInputsDesc()) {
if (AttrUtils::HasAttr(tensor, ATTR_NAME_PARENT_NODE_INDEX)) {
return true;
@@ -557,4 +583,58 @@ bool NodeUtils::GetConstOpType(const NodePtr &in_node, std::string &op_type) {

return false;
}

///
/// @brief Recursively remove a node's subgraphs, including subgraphs of nodes inside those subgraphs.
/// @param [in] node
/// @return GRAPH_SUCCESS if removed successfully; another status on failure.
///
graphStatus NodeUtils::RemoveSubgraphsOnNode(const NodePtr &node) {
GE_CHECK_NOTNULL(node);
auto op_desc = node->GetOpDesc();
GE_CHECK_NOTNULL(op_desc);
auto subgraph_names = op_desc->GetSubgraphInstanceNames();
if (subgraph_names.empty()) {
return GRAPH_SUCCESS;
} else {
auto owner_graph = node->GetOwnerComputeGraph();
GE_CHECK_NOTNULL(owner_graph);
auto root_graph = GraphUtils::FindRootGraph(owner_graph);
GE_CHECK_NOTNULL(root_graph);

std::unordered_set<std::string> subgraph_to_remove;
for (auto &subgraph_name : subgraph_names) {
std::deque<std::string> queue;
queue.push_back(subgraph_name);
subgraph_to_remove.insert(subgraph_name);
op_desc->RemoveSubgraphInstanceName(subgraph_name);
while (!queue.empty()) {
auto graph_name = queue.front();
queue.pop_front();

auto subgraph = root_graph->GetSubgraph(graph_name);
GE_CHECK_NOTNULL(subgraph);
for (const auto &sub_node : subgraph->GetDirectNode()) {
auto sub_op_desc = sub_node->GetOpDesc();
GE_CHECK_NOTNULL(sub_op_desc);
auto sub_names = sub_op_desc->GetSubgraphInstanceNames();
// Subgraph and all nodes in it will be removed later,
// no need to remove 'SubgraphInstanceName' in op desc here.
for (auto &name : sub_names) {
if (subgraph_to_remove.insert(name).second) {
queue.push_back(name);
}
}
}
}
}
// Remove subgraph from root_graph
for (const auto &name : subgraph_to_remove) {
GELOGI("Remove subgraph:%s.", name.c_str());
root_graph->RemoveSubgraph(name);
}
}

return GRAPH_SUCCESS;
}
} // namespace ge
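
A hedged call-site sketch for the new helper; the node here is a stand-in, and any data or control edges are assumed to be relinked by the caller before the node itself is erased:

```cpp
// Hedged sketch: detach the subgraph tree hanging off a functional node
// (If/Case/While/For) before deleting the node. `node` is illustrative.
#include "graph/utils/node_utils.h"

ge::graphStatus DetachSubgraphs(const ge::NodePtr &node) {
  ge::graphStatus ret = ge::NodeUtils::RemoveSubgraphsOnNode(node);
  if (ret != ge::GRAPH_SUCCESS) {
    return ret;  // node, op desc, or one of its subgraphs was missing/null
  }
  return ge::GRAPH_SUCCESS;
}
```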

+ 17
- 0
src/common/graph/utils/op_desc_utils.cc View File

@@ -199,6 +199,23 @@ GE_FUNC_DEV_VISIBILITY GE_FUNC_HOST_VISIBILITY vector<ge::NodePtr> OpDescUtils::
auto in_node = out_anchor->GetOwnerNode();
if ((in_node->GetType() == CONSTANT) || (in_node->GetType() == CONSTANTOP)) {
ret.push_back(in_node);
} else if (in_node->GetType() == DATA) {
const ComputeGraphPtr &graph = node.GetOwnerComputeGraph();
GE_CHK_BOOL_EXEC(graph != nullptr, continue, "Owner graph is null");

const NodePtr &parent_node = graph->GetParentNode();
if (parent_node == nullptr) {
continue; // Root graph.
}

if (kWhileOpTypes.count(parent_node->GetType()) > 0) {
continue; // Subgraph of While cond or body.
}

NodePtr input_node = NodeUtils::GetParentInput(in_node);
if ((input_node != nullptr) && ((input_node->GetType() == CONSTANT) || (input_node->GetType() == CONSTANTOP))) {
ret.push_back(input_node);
}
}
}
return ret;


+ 30
- 0
src/common/graph/utils/type_utils.cc View File

@@ -17,6 +17,8 @@
#include "graph/utils/type_utils.h"
#include "debug/ge_util.h"

using domi::domiTensorFormat_t;

namespace ge {
static const std::map<Format, std::string> kFormatToStringMap = {
{FORMAT_NCHW, "NCHW"},
@@ -60,6 +62,25 @@ static const std::map<Format, std::string> kFormatToStringMap = {
{FORMAT_RESERVED, "FORMAT_RESERVED"},
{FORMAT_ALL, "ALL"}};

static const std::map<domiTensorFormat_t, Format> kDomiFormatToGeFormat = {
{domi::DOMI_TENSOR_NCHW, FORMAT_NCHW},
{domi::DOMI_TENSOR_NHWC, FORMAT_NHWC},
{domi::DOMI_TENSOR_ND, FORMAT_ND},
{domi::DOMI_TENSOR_NC1HWC0, FORMAT_NC1HWC0},
{domi::DOMI_TENSOR_FRACTAL_Z, FORMAT_FRACTAL_Z},
{domi::DOMI_TENSOR_NC1C0HWPAD, FORMAT_NC1C0HWPAD},
{domi::DOMI_TENSOR_NHWC1C0, FORMAT_NHWC1C0},
{domi::DOMI_TENSOR_FSR_NCHW, FORMAT_FSR_NCHW},
{domi::DOMI_TENSOR_FRACTAL_DECONV, FORMAT_FRACTAL_DECONV},
{domi::DOMI_TENSOR_BN_WEIGHT, FORMAT_BN_WEIGHT},
{domi::DOMI_TENSOR_CHWN, FORMAT_CHWN},
{domi::DOMI_TENSOR_FILTER_HWCK, FORMAT_FILTER_HWCK},
{domi::DOMI_TENSOR_NDHWC, FORMAT_NDHWC},
{domi::DOMI_TENSOR_NCDHW, FORMAT_NCDHW},
{domi::DOMI_TENSOR_DHWCN, FORMAT_DHWCN},
{domi::DOMI_TENSOR_DHWNC, FORMAT_DHWNC},
{domi::DOMI_TENSOR_RESERVED, FORMAT_RESERVED}};

static const std::unordered_set<std::string> kInternalFormat = {"NC1HWC0",
"FRACTAL_Z",
"NC1C0HWPAD",
@@ -282,6 +303,15 @@ Format TypeUtils::DataFormatToFormat(const std::string &str) {
}
}

Format TypeUtils::DomiFormatToFormat(domi::domiTensorFormat_t domi_format) {
auto it = kDomiFormatToGeFormat.find(domi_format);
if (it != kDomiFormatToGeFormat.end()) {
return it->second;
}
GELOGE(GRAPH_FAILED, "do not find domi Format %d from map", domi_format);
return FORMAT_RESERVED;
}

static inline void CopyDataFromBuffer(vector<uint8_t> &data, const Buffer &buffer) {
data.clear();
if (buffer.GetData() != nullptr && buffer.GetSize() != 0) {
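
A hedged sketch of the new conversion entry point; the input format value is illustrative:

```cpp
// Hedged sketch: mapping a parser-side (domi) tensor format to a GE Format.
// Unmapped values fall back to FORMAT_RESERVED, as implemented above.
#include "graph/utils/type_utils.h"

ge::Format Demo() {
  ge::Format fmt = ge::TypeUtils::DomiFormatToFormat(domi::DOMI_TENSOR_NHWC);
  // fmt == ge::FORMAT_NHWC; an unknown input would yield ge::FORMAT_RESERVED
  return fmt;
}
```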


+ 88
- 71
src/ge/CMakeLists.txt View File

@@ -64,6 +64,7 @@ file(GLOB TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
"common/helper/model_cache_helper.cc"
"common/profiling/profiling_manager.cc"
"engine_manager/dnnengine_manager.cc"
"executor/ge_executor.cc"
"ge_local_engine/engine/host_cpu_engine.cc"
"generator/ge_generator.cc"
"generator/generator_api.cc"
@@ -107,47 +108,61 @@ file(GLOB TRAIN_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
"graph/partition/engine_place.cc"
"graph/partition/graph_partition.cc"
"graph/passes/*.cc"
"graph/passes/folding_kernel/add_kernel.cc"
"graph/passes/folding_kernel/broadcast_args_kernel.cc"
"graph/passes/folding_kernel/broadcast_gradient_args_kernel.cc"
"graph/passes/folding_kernel/cast_kernel.cc"
"graph/passes/folding_kernel/concat_offset_kernel.cc"
"graph/passes/folding_kernel/concat_v2_kernel.cc"
"graph/passes/folding_kernel/dynamic_stitch_kernel.cc"
"graph/passes/folding_kernel/empty_kernel.cc"
"graph/passes/folding_kernel/expanddims_kernel.cc"
"graph/passes/folding_kernel/fill_kernel.cc"
"graph/passes/folding_kernel/floordiv_kernel.cc"
"graph/passes/folding_kernel/floormod_kernel.cc"
"graph/passes/folding_kernel/gather_v2_kernel.cc"
"graph/passes/folding_kernel/greater_kernel.cc"
"graph/passes/folding_kernel/kernel_utils.cc"
"graph/passes/folding_kernel/maximum_kernel.cc"
"graph/passes/folding_kernel/mul_kernel.cc"
"graph/passes/folding_kernel/pack_kernel.cc"
"graph/passes/folding_kernel/permute_kernel.cc"
"graph/passes/folding_kernel/range_kernel.cc"
"graph/passes/folding_kernel/rank_kernel.cc"
"graph/passes/folding_kernel/reduce_prod_kernel.cc"
"graph/passes/folding_kernel/reshape_kernel.cc"
"graph/passes/folding_kernel/rsqrt_kernel.cc"
"graph/passes/folding_kernel/shape_kernel.cc"
"graph/passes/folding_kernel/shape_n_kernel.cc"
"graph/passes/folding_kernel/size_kernel.cc"
"graph/passes/folding_kernel/slice_d_kernel.cc"
"graph/passes/folding_kernel/slice_kernel.cc"
"graph/passes/folding_kernel/squeeze_kernel.cc"
"graph/passes/folding_kernel/ssd_prior_box_kernel.cc"
"graph/passes/folding_kernel/strided_slice_kernel.cc"
"graph/passes/folding_kernel/sub_kernel.cc"
"graph/passes/folding_kernel/transdata_kernel.cc"
"graph/passes/folding_kernel/unpack_kernel.cc"
"host_kernels/add_kernel.cc"
"host_kernels/broadcast_args_kernel.cc"
"host_kernels/broadcast_gradient_args_kernel.cc"
"host_kernels/cast_kernel.cc"
"host_kernels/concat_offset_kernel.cc"
"host_kernels/concat_v2_kernel.cc"
"host_kernels/dynamic_stitch_kernel.cc"
"host_kernels/empty_kernel.cc"
"host_kernels/expanddims_kernel.cc"
"host_kernels/fill_kernel.cc"
"host_kernels/floordiv_kernel.cc"
"host_kernels/floormod_kernel.cc"
"host_kernels/gather_v2_kernel.cc"
"host_kernels/greater_kernel.cc"
"host_kernels/kernel_utils.cc"
"host_kernels/maximum_kernel.cc"
"host_kernels/mul_kernel.cc"
"host_kernels/pack_kernel.cc"
"host_kernels/permute_kernel.cc"
"host_kernels/range_kernel.cc"
"host_kernels/rank_kernel.cc"
"host_kernels/reduce_prod_kernel.cc"
"host_kernels/reshape_kernel.cc"
"host_kernels/rsqrt_kernel.cc"
"host_kernels/shape_kernel.cc"
"host_kernels/shape_n_kernel.cc"
"host_kernels/size_kernel.cc"
"host_kernels/slice_d_kernel.cc"
"host_kernels/slice_kernel.cc"
"host_kernels/squeeze_kernel.cc"
"host_kernels/ssd_prior_box_kernel.cc"
"host_kernels/strided_slice_kernel.cc"
"host_kernels/sub_kernel.cc"
"host_kernels/transdata_kernel.cc"
"host_kernels/transpose_kernel.cc"
"host_kernels/unpack_kernel.cc"
"graph/preprocess/graph_preprocess.cc"
"graph/preprocess/insert_op/ge_aipp_op.cc"
"graph/preprocess/insert_op/util_insert_aipp_op.cc"
"graph/preprocess/multi_batch_copy_graph.cc"
"hybrid/common/npu_memory_allocator.cc"
"hybrid/common/tensor_value.cc"
"hybrid/executor/*.cc"
"hybrid/executor/worker/*.cc"
"hybrid/hybrid_davinci_model.cc"
"hybrid/model/*.cc"
"hybrid/node_executor/aicore/*.cc"
"hybrid/node_executor/aicpu/aicpu_node_executor.cc"
"hybrid/node_executor/compiledsubgraph/known_node_executor.cc"
"hybrid/node_executor/hostcpu/ge_local_node_executor.cc"
"hybrid/node_executor/node_executor.cc"
"hybrid/node_executor/task_context.cc"
"init/gelib.cc"
"model/ge_model.cc"
"model/ge_root_model.cc"
"omm/csa_interact.cc"
"opskernel_manager/ops_kernel_manager.cc"
"session/inner_session.cc"
@@ -231,42 +246,43 @@ file(GLOB INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
"graph/partition/engine_place.cc"
"graph/partition/graph_partition.cc"
"graph/passes/*.cc"
"graph/passes/folding_kernel/add_kernel.cc"
"graph/passes/folding_kernel/broadcast_args_kernel.cc"
"graph/passes/folding_kernel/broadcast_gradient_args_kernel.cc"
"graph/passes/folding_kernel/cast_kernel.cc"
"graph/passes/folding_kernel/concat_offset_kernel.cc"
"graph/passes/folding_kernel/concat_v2_kernel.cc"
"graph/passes/folding_kernel/dynamic_stitch_kernel.cc"
"graph/passes/folding_kernel/empty_kernel.cc"
"graph/passes/folding_kernel/expanddims_kernel.cc"
"graph/passes/folding_kernel/fill_kernel.cc"
"graph/passes/folding_kernel/floordiv_kernel.cc"
"graph/passes/folding_kernel/floormod_kernel.cc"
"graph/passes/folding_kernel/gather_v2_kernel.cc"
"graph/passes/folding_kernel/greater_kernel.cc"
"graph/passes/folding_kernel/kernel_utils.cc"
"graph/passes/folding_kernel/maximum_kernel.cc"
"graph/passes/folding_kernel/mul_kernel.cc"
"graph/passes/folding_kernel/pack_kernel.cc"
"graph/passes/folding_kernel/permute_kernel.cc"
"graph/passes/folding_kernel/range_kernel.cc"
"graph/passes/folding_kernel/rank_kernel.cc"
"graph/passes/folding_kernel/reduce_prod_kernel.cc"
"graph/passes/folding_kernel/reshape_kernel.cc"
"graph/passes/folding_kernel/rsqrt_kernel.cc"
"graph/passes/folding_kernel/shape_kernel.cc"
"graph/passes/folding_kernel/shape_n_kernel.cc"
"graph/passes/folding_kernel/size_kernel.cc"
"graph/passes/folding_kernel/slice_d_kernel.cc"
"graph/passes/folding_kernel/slice_kernel.cc"
"graph/passes/folding_kernel/squeeze_kernel.cc"
"graph/passes/folding_kernel/ssd_prior_box_kernel.cc"
"graph/passes/folding_kernel/strided_slice_kernel.cc"
"graph/passes/folding_kernel/sub_kernel.cc"
"graph/passes/folding_kernel/transdata_kernel.cc"
"graph/passes/folding_kernel/transpose_kernel.cc"
"graph/passes/folding_kernel/unpack_kernel.cc"
"host_kernels/add_kernel.cc"
"host_kernels/broadcast_args_kernel.cc"
"host_kernels/broadcast_gradient_args_kernel.cc"
"host_kernels/cast_kernel.cc"
"host_kernels/concat_offset_kernel.cc"
"host_kernels/concat_v2_kernel.cc"
"host_kernels/dynamic_stitch_kernel.cc"
"host_kernels/empty_kernel.cc"
"host_kernels/expanddims_kernel.cc"
"host_kernels/fill_kernel.cc"
"host_kernels/floordiv_kernel.cc"
"host_kernels/floormod_kernel.cc"
"host_kernels/gather_v2_kernel.cc"
"host_kernels/greater_kernel.cc"
"host_kernels/kernel_utils.cc"
"host_kernels/maximum_kernel.cc"
"host_kernels/mul_kernel.cc"
"host_kernels/pack_kernel.cc"
"host_kernels/permute_kernel.cc"
"host_kernels/range_kernel.cc"
"host_kernels/rank_kernel.cc"
"host_kernels/reduce_prod_kernel.cc"
"host_kernels/reshape_kernel.cc"
"host_kernels/rsqrt_kernel.cc"
"host_kernels/shape_kernel.cc"
"host_kernels/shape_n_kernel.cc"
"host_kernels/size_kernel.cc"
"host_kernels/slice_d_kernel.cc"
"host_kernels/slice_kernel.cc"
"host_kernels/squeeze_kernel.cc"
"host_kernels/ssd_prior_box_kernel.cc"
"host_kernels/strided_slice_kernel.cc"
"host_kernels/sub_kernel.cc"
"host_kernels/transdata_kernel.cc"
"host_kernels/transpose_kernel.cc"
"host_kernels/unpack_kernel.cc"
"hybrid/hybrid_davinci_model_stub.cc"
"graph/preprocess/graph_preprocess.cc"
"graph/preprocess/insert_op/ge_aipp_op.cc"
"graph/preprocess/insert_op/util_insert_aipp_op.cc"
@@ -275,6 +291,7 @@ file(GLOB INFER_SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
"ir_build/atc_ir_common.cc"
"ir_build/ge_ir_build.cc"
"model/ge_model.cc"
"model/ge_root_model.cc"
"omm/csa_interact.cc"
"opskernel_manager/ops_kernel_manager.cc"
"session/inner_session.cc"


+ 2
- 19
src/ge/client/ge_api.cc

@@ -49,7 +49,7 @@ void GetOpsProtoPath(std::string &opsproto_path) {
const char *path_env = std::getenv("ASCEND_OPP_PATH");
if (path_env != nullptr) {
std::string path = path_env;
opsproto_path = (path + "/op_proto/built-in/" + ":") + (path + "/op_proto/custom/");
opsproto_path = (path + "/op_proto/custom/" + ":") + (path + "/op_proto/built-in/");
GELOGI("Get opsproto so path from env: %s", path.c_str());
return;
}
@@ -57,7 +57,7 @@ void GetOpsProtoPath(std::string &opsproto_path) {
GELOGI("path_base is %s", path_base.c_str());
path_base = path_base.substr(0, path_base.rfind('/'));
path_base = path_base.substr(0, path_base.rfind('/') + 1);
opsproto_path = (path_base + "ops/op_proto/built-in/" + ":") + (path_base + "ops/op_proto/custom/");
opsproto_path = (path_base + "ops/op_proto/custom/" + ":") + (path_base + "ops/op_proto/built-in/");
}
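The reordering above is behavioral, not cosmetic: the loader walks the colon-separated list in order, so custom op protos are now found before built-in ones, presumably so user-defined protos of the same name take precedence. A sketch of the resulting value, assuming ASCEND_OPP_PATH=/opt/ascend/opp (the path is illustrative):

    // opsproto_path after this change:
    //   "/opt/ascend/opp/op_proto/custom/:/opt/ascend/opp/op_proto/built-in/"
    std::string custom = path + "/op_proto/custom/";
    std::string built_in = path + "/op_proto/built-in/";
    opsproto_path = custom + ":" + built_in;  // custom entries are searched first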

Status CheckDumpAndReuseMemory(const std::map<string, string> &options) {
@@ -103,20 +103,6 @@ Status CheckOptionsValid(const std::map<string, string> &options) {
return SUCCESS;
}

void SaveDdkVersion(const std::map<string, string> &options) {
auto ddk_option = options.find(DDK_VERSION_FLAG);
if (ddk_option != options.end()) {
auto ddk_version = ddk_option->second;
if (!ddk_version.empty()) {
GELOGI("Input ddk version : %s.", ddk_version.c_str());
domi::GetContext().ddk_version = ddk_version;
}
} else {
GELOGW("No ddkVersion!");
return;
}
}

// Initialize GE, prepare for execution, call GELib::Initialize
Status GEInitialize(const std::map<string, string> &options) {
GELOGT(TRACE_INIT, "GEInitialize start");
@@ -146,9 +132,6 @@ Status GEInitialize(const std::map<string, string> &options) {
}
GE_TIMESTAMP_END(CheckOptionsValid, "GEInitialize::CheckOptionsValid");

GE_TIMESTAMP_START(InitPreparation);
SaveDdkVersion(options);
GE_TIMESTAMP_END(InitPreparation, "GEInitialize::InitPreparation");
// call Initialize
GELOGT(TRACE_RUNNING, "Initializing environment");
GE_TIMESTAMP_START(GELibInitialize);


+ 5
- 1
src/ge/common/convert/pb2json.cc

@@ -22,6 +22,7 @@
#include <string>
#include "securec.h"
#include "framework/common/fmk_types.h"
#include "framework/common/debug/ge_log.h"

using std::set;
using std::string;
@@ -146,7 +147,10 @@ string Pb2Json::TypeBytes2String(string &field_name, string &type_bytes) {
uint8_t *value = 0;
value = reinterpret_cast<uint8_t *>(&temp_value);
char str[kSignificantDigits];
sprintf_s(str, kSignificantDigits, "%d", *value);
if (sprintf_s(str, kSignificantDigits, "%d", *value) == -1) {
GELOGW("Convert bytes to string fail, filed name:%s", field_name.c_str());
continue;
}
result += str;
}
return result;


+ 22
- 15
src/ge/common/formats/format_transfers/format_transfer_transpose.cc

@@ -21,6 +21,7 @@

#include "common/formats/utils/formats_trans_utils.h"
#include "framework/common/debug/ge_log.h"
#include "framework/common/debug/log.h"
#include "graph/utils/type_utils.h"

namespace ge {
@@ -199,6 +200,23 @@ Status TransposeWithShapeCheck(const uint8_t *data, const std::vector<int64_t> &
return Transpose(data, src_shape, src_data_type, perm_arg, result);
}

Status GetPermByFormat(Format src_format, Format dst_format, std::vector<int64_t> &perm) {
auto dst_iter = perm_args.find(src_format);
if (dst_iter == perm_args.end()) {
GELOGE(UNSUPPORTED, "Failed to trans shape, transpose from format %s to %s is not supported",
TypeUtils::FormatToSerialString(src_format).c_str(), TypeUtils::FormatToSerialString(dst_format).c_str());
return UNSUPPORTED;
}
auto iter = dst_iter->second.find(dst_format);
if (iter == dst_iter->second.end()) {
GELOGE(UNSUPPORTED, "Failed to trans shape, transpose from format %s to %s is not supported",
TypeUtils::FormatToSerialString(src_format).c_str(), TypeUtils::FormatToSerialString(dst_format).c_str());
return UNSUPPORTED;
}
perm = iter->second;
return SUCCESS;
}

Status FormatTransferTranspose::TransFormat(const TransArgs &args, TransResult &result) {
std::vector<int64_t> expected_shape;
auto ret = TransShape(args.src_format, args.src_shape, args.src_data_type, args.dst_format, expected_shape);
@@ -218,23 +236,12 @@ Status FormatTransferTranspose::TransFormat(const TransArgs &args, TransResult &

Status FormatTransferTranspose::TransShape(Format src_format, const std::vector<int64_t> &src_shape, DataType data_type,
Format dst_format, std::vector<int64_t> &dst_shape) {
auto dst_iter = perm_args.find(src_format);
if (dst_iter == perm_args.end()) {
GELOGE(UNSUPPORTED, "Failed to trans shape, do not support transpose from format %s to %s",
TypeUtils::FormatToSerialString(src_format).c_str(), TypeUtils::FormatToSerialString(dst_format).c_str());
return UNSUPPORTED;
}
auto iter = dst_iter->second.find(dst_format);
if (iter == dst_iter->second.end()) {
GELOGE(UNSUPPORTED, "Failed to trans shape, do not support transpose from format %s to %s",
TypeUtils::FormatToSerialString(src_format).c_str(), TypeUtils::FormatToSerialString(dst_format).c_str());
return UNSUPPORTED;
}

if (!IsShapeArgValid(src_shape, iter->second)) {
std::vector<int64_t> perm_arg;
GE_CHK_STATUS_RET_NOLOG(GetPermByFormat(src_format, dst_format, perm_arg));
if (!IsShapeArgValid(src_shape, perm_arg)) {
return PARAM_INVALID;
}
dst_shape = TransShapeByPerm(src_shape, iter->second);
dst_shape = TransShapeByPerm(src_shape, perm_arg);
return SUCCESS;
}
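With the permutation lookup factored out, TransShape shrinks to a validity check plus TransShapeByPerm, and other transfers can reuse the lookup. A minimal sketch, assuming the NCHW->NHWC pair is registered in perm_args and the call is made from within the same formats namespace:

    std::vector<int64_t> perm;
    if (GetPermByFormat(ge::FORMAT_NCHW, ge::FORMAT_NHWC, perm) == ge::SUCCESS) {
      // For NCHW -> NHWC the permutation is expected to be {0, 2, 3, 1}.
    }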



+ 2
- 0
src/ge/common/formats/format_transfers/format_transfer_transpose.h

@@ -31,6 +31,8 @@ Status TransposeWithShapeCheck(const uint8_t *src, const std::vector<int64_t> &s
const std::vector<int64_t> &dst_shape, DataType src_data_type,
const std::vector<int64_t> &perm_arg, TransResult &result);

Status GetPermByFormat(Format src_format, Format dst_format, std::vector<int64_t> &perm);

class FormatTransferTranspose : public FormatTransfer {
public:
Status TransFormat(const TransArgs &args, TransResult &result) override;


+ 1
- 2
src/ge/common/helper/model_helper.cc

@@ -180,8 +180,7 @@ ModelHelper::SaveOriginalGraphToOmModel(const ge::Graph &graph, const std::strin
GELOGE(FAILED, "SaveModel fail for compute_graph null");
return FAILED;
}
ge::GraphUtils::DumpGEGraph(compute_graph, "OriginalGraph");
ge::GraphUtils::DumpGEGraphToOnnx(*compute_graph, "OriginalGraph");
GE_DUMP(compute_graph, "OriginalGraph");
// Model
ModelPtr model_ptr = ge::MakeShared<ge::Model>();
GE_CHECK_NOTNULL_EXEC(model_ptr, return MEMALLOC_FAILED);


+ 7
- 4
src/ge/common/profiling/profiling_manager.cc

@@ -74,14 +74,17 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::In
return FAILED;
}
// profiling startup first time
GELOGI("Begin to init profiling, device num %zu", device_id_.size());
for (size_t i = 0; i < device_id_.size(); ++i) {
ret = StartProfiling(0, device_id_[i]);
if (ret != SUCCESS) {
GELOGE(ret, "Profiling start failed.");
GELOGE(ret, "Profiling start failed on device %d.", device_id_[i]);
return FAILED;
}
GELOGI("Profiling init succ.");
GELOGI("Profiling init succ on device %d.", device_id_[i]);
}
} else {
GELOGI("The profiling is off, skip the initialization");
}
#endif
return SUCCESS;
@@ -164,7 +167,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::In
}

is_profiling_ = true;
} catch (Json::parse_error &e) {
} catch (...) {
GELOGE(FAILED, "Json conf is not invalid !");
return ge::PARAM_INVALID;
}
@@ -274,7 +277,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ge::Status ProfilingManager::St
ss << start_cfg;
send_profiling_config_ = ss.str();
GELOGI("Profiling config %s\n", send_profiling_config_.c_str());
} catch (Json::parse_error &e) {
} catch (...) {
GELOGE(FAILED, "Op trace json conf is not invalid !");
return FAILED;
}


+ 7
- 0
src/ge/common/types.cc

@@ -389,6 +389,7 @@ REGISTER_OPTYPE_DEFINE(STREAMMERGE, "StreamMerge");
REGISTER_OPTYPE_DEFINE(ENDGRAPH, "EndGraph");
REGISTER_OPTYPE_DEFINE(SEND, "Send");
REGISTER_OPTYPE_DEFINE(RECV, "Recv");
REGISTER_OPTYPE_DEFINE(ENDOFSEQUENCE, "EndOfSequence");

REGISTER_OPTYPE_DEFINE(LABELSET, "LabelSet");
REGISTER_OPTYPE_DEFINE(LABELGOTO, "LabelGoto");
@@ -456,6 +457,12 @@ REGISTER_OPTYPE_DEFINE(SIGMOIDGRAD, "SigmoidGrad");

REGISTER_OPTYPE_DEFINE(TRANSSHAPE, "TransShape");

// Horovod operator
REGISTER_OPTYPE_DEFINE(HVDCALLBACKALLREDUCE, "HorovodAllreduce");
REGISTER_OPTYPE_DEFINE(HVDCALLBACKALLGATHER, "HorovodAllgather");
REGISTER_OPTYPE_DEFINE(HVDCALLBACKBROADCAST, "HorovodBroadcast");
REGISTER_OPTYPE_DEFINE(HVDWAIT, "HorovodWait");

const std::string MODEL_ATTR_TASKS = "tasks";
const std::string MODEL_ATTR_TASK_GEN_BASE_ADDR = "task_gen_base_addr";
const std::string MODEL_ATTR_TASK_GEN_WEIGHT_ADDR = "task_gen_weight_addr";


+ 41
- 30
src/ge/common/util.cc

@@ -67,8 +67,9 @@ static bool ReadProtoFromCodedInputStream(CodedInputStream &coded_stream, Messag
}

FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ReadProtoFromBinaryFile(const char *file, Message *proto) {
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG((file == nullptr || proto == nullptr), return false,
"incorrect parameter. nullptr == file || nullptr == proto");
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG((file == nullptr || proto == nullptr),
ErrorManager::GetInstance().ATCReportErrMessage("E19001");
return false, "Input parameter file or proto is nullptr!");

std::string real_path = RealPath(file);
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(real_path.empty(), return false, "pb file path '%s' not valid", file);
@@ -77,7 +78,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ReadProtoFromBinaryFile(co

std::ifstream fs(real_path, std::ifstream::in | std::ifstream::binary);
if (!fs.is_open()) {
GELOGE(ge::FAILED, "Open %s failed.", file);
ErrorManager::GetInstance().ATCReportErrMessage("E19004", {"realpath"}, {file});
GELOGE(ge::FAILED, "Open real path[%s] failed.", file);
return false;
}

@@ -89,7 +91,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ReadProtoFromBinaryFile(co
fs.close();

if (!ret) {
GELOGE(ge::FAILED, "Parse %s failed.", file);
ErrorManager::GetInstance().ATCReportErrMessage("E19005", {"filepath"}, {file});
GELOGE(ge::FAILED, "Parse file[%s] failed.", file);
return ret;
}

@@ -113,17 +116,17 @@ long GetFileLength(const std::string &input_file) {
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(real_path.empty(), return -1, "input_file path '%s' not valid", input_file.c_str());
unsigned long long file_length = 0;
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(mmGetFileSize(input_file.c_str(), &file_length) != EN_OK,
ErrorManager::GetInstance().ATCReportErrMessage("E10037");
return -1, "open file failed.");
ErrorManager::GetInstance().ATCReportErrMessage("E10037", {"filepath"}, {input_file});
return -1, "Open file[%s] failed", input_file.c_str());

GE_CHK_BOOL_TRUE_EXEC_WITH_LOG((file_length == 0), ErrorManager::GetInstance().ATCReportErrMessage("E10038");
return -1, "file length is 0, not valid.");
return -1, "File[%s] length is 0, not valid.", input_file.c_str());

GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(
file_length > kMaxFileSizeLimit,
ErrorManager::GetInstance().ATCReportErrMessage("E10039", {"filesize", "maxlen"},
{std::to_string(file_length), std::to_string(kMaxFileSizeLimit)});
return -1, "file size %lld is out of limit: %d.", file_length, kMaxFileSizeLimit);
file_length > kMaxFileSizeLimit, ErrorManager::GetInstance().ATCReportErrMessage(
"E10039", {"filepath", "filesize", "maxlen"},
{input_file, std::to_string(file_length), std::to_string(kMaxFileSizeLimit)});
return -1, "File[%s] size %lld is out of limit: %d.", input_file.c_str(), file_length, kMaxFileSizeLimit);
return static_cast<long>(file_length);
}

@@ -202,7 +205,9 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int CreateDirectory(const std::
GE_CHK_BOOL_EXEC(!directory_path.empty(), return -1, "directory path is empty.");
auto dir_path_len = directory_path.length();
if (dir_path_len >= PATH_MAX) {
GELOGW("Directory path is too long.");
ErrorManager::GetInstance().ATCReportErrMessage("E19002", {"filepath", "size"},
{directory_path, std::to_string(PATH_MAX)});
GELOGW("Path[%s] len is too long, it must smaller than %d", directory_path.c_str(), PATH_MAX);
return -1;
}
char tmp_dir_path[PATH_MAX] = {0};
@@ -213,8 +218,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int CreateDirectory(const std::
int32_t ret = mmMkdir(tmp_dir_path, S_IRUSR | S_IWUSR | S_IXUSR); // 700
if (ret != 0) {
if (errno != EEXIST) {
GELOGW("Cannot create directory %s. Make sure that the directory exists and writable.",
directory_path.c_str());
ErrorManager::GetInstance().ATCReportErrMessage("E19006", {"path"}, {directory_path});
GELOGW("Cannot create directory %s. Make sure the directory exists and writable.", directory_path.c_str());
return ret;
}
}
@@ -224,7 +229,8 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY int CreateDirectory(const std::
int32_t ret = mmMkdir(const_cast<char *>(directory_path.c_str()), S_IRUSR | S_IWUSR | S_IXUSR); // 700
if (ret != 0) {
if (errno != EEXIST) {
GELOGW("Cannot create directory %s. Make sure that the directory exists and writable.", directory_path.c_str());
ErrorManager::GetInstance().ATCReportErrMessage("E19006", {"path"}, {directory_path});
GELOGW("Cannot create directory %s. Make sure the directory exists and writable.", directory_path.c_str());
return ret;
}
}
@@ -253,16 +259,17 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool ReadProtoFromText(const ch

std::string real_path = RealPath(file);
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(real_path.empty(),
ErrorManager::GetInstance().ATCReportErrMessage("E10036", {"realpath"}, {file});
return false, "proto file real path '%s' not valid", file);
ErrorManager::GetInstance().ATCReportErrMessage("E10036", {"filepath"}, {file});
return false, "Get path[%s]'s real path failed", file);

GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(GetFileLength(real_path) == -1, return false, "file size not valid.");

std::ifstream fs(real_path.c_str(), std::ifstream::in);

if (!fs.is_open()) {
ErrorManager::GetInstance().ATCReportErrMessage("E10040", {"protofile"}, {file});
GELOGE(ge::FAILED, "Fail to open proto file '%s'.", file);
ErrorManager::GetInstance().ATCReportErrMessage("E10040", {"realpth", "protofile"}, {real_path, file});
GELOGE(ge::FAILED, "Fail to open proto file real path is '%s' when orginal file path is '%s'.", real_path.c_str(),
file);
return false;
}

@@ -328,18 +335,21 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool CheckInt64MulOverflow(int6

FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY std::string RealPath(const char *path) {
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(path == nullptr, return "", "path pointer is NULL.");
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(strlen(path) >= PATH_MAX, return "", "path is invalid");
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(
strlen(path) >= PATH_MAX,
ErrorManager::GetInstance().ATCReportErrMessage("E19002", {"filepath", "size"}, {path, std::to_string(PATH_MAX)});
return "", "Path[%s] len is too long, it must smaller than %d", path, PATH_MAX);
// PATH_MAX is the system's own macro, indicating the maximum file path length supported
std::shared_ptr<char> resolved_path(new (std::nothrow) char[PATH_MAX](), std::default_delete<char[]>());
if (resolved_path == nullptr) {
GELOGW("new an PATH_MAX string object failed.");
return "";
}

std::string res;
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(
resolved_path == nullptr,
ErrorManager::GetInstance().ATCReportErrMessage("E19003", {"filepath", "size"}, {path, std::to_string(PATH_MAX)});
return "", "Path[%s] new string object len[%d] failed.", path, PATH_MAX);

// Nullptr is returned when the path does not exist or there is no permission
// Return absolute path when path is accessible
std::string res;
if (realpath(path, resolved_path.get()) != nullptr) {
res = resolved_path.get();
}
@@ -360,7 +370,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool CheckInputPathValid(const
// Unable to get absolute path (does not exist or does not have permission to access)
if (real_path.empty()) {
ErrorManager::GetInstance().ATCReportErrMessage("E10002", {"path", "errmsg"}, {file_path.c_str(), strerror(errno)});
GELOGW("Can not get real path for %s, %s", file_path.c_str(), strerror(errno));
GELOGW("Path[%s]'s realpath is empty, errmsg[%s]", file_path.c_str(), strerror(errno));
return false;
}

@@ -381,7 +391,7 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool CheckInputPathValid(const
// The absolute path points to a file that is not readable
if (access(real_path.c_str(), R_OK) != 0) {
ErrorManager::GetInstance().ATCReportErrMessage("E10003", {"path", "errmsg"}, {file_path.c_str(), strerror(errno)});
GELOGW("Read path[%s] failed, %s", file_path.c_str(), strerror(errno));
GELOGW("Read path[%s] failed, errmsg[%s]", file_path.c_str(), strerror(errno));
return false;
}

@@ -416,9 +426,10 @@ FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY bool CheckOutputPathValid(const

// File is not readable or writable
if (access(real_path.c_str(), W_OK | F_OK) != 0) {
ErrorManager::GetInstance().ATCReportErrMessage("E10004", {"path", "errmsg"},
{real_path.c_str(), strerror(errno)});
GELOGW("Write file failed, path[%s], %s", real_path.c_str(), strerror(errno));
ErrorManager::GetInstance().ATCReportErrMessage("E10004", {"realpath", "path", "errmsg"},
{real_path, file_path, strerror(errno)});
GELOGW("Write file[%s] failed, input path is %s, errmsg[%s]", real_path.c_str(), file_path.c_str(),
strerror(errno));
return false;
}
} else {


+ 3
- 0
src/ge/executor/CMakeLists.txt

@@ -59,12 +59,15 @@ file(GLOB SRC_LIST RELATIVE ${CMAKE_CURRENT_LIST_DIR}
"../graph/load/new_model_manager/tbe_handle_store.cc"
"../graph/load/new_model_manager/zero_copy_task.cc"
"../graph/load/output/output.cc"
"../graph/manager/graph_caching_allocator.cc"
"../graph/manager/graph_manager_utils.cc"
"../graph/manager/graph_mem_allocator.cc"
"../graph/manager/graph_var_manager.cc"
"../graph/manager/trans_var_data_utils.cc"
"../graph/manager/util/debug.cc"
"../hybrid/hybrid_davinci_model_stub.cc"
"../model/ge_model.cc"
"../model/ge_root_model.cc"
"../omm/csa_interact.cc"
"../single_op/single_op.cc"
"../single_op/single_op_manager.cc"


+ 17
- 0
src/ge/ge_local_engine/ops_kernel_store/ge_local_ops_kernel_info.cc

@@ -20,6 +20,7 @@
#include "common/ge/ge_util.h"
#include "common/ge_inner_error_codes.h"
#include "framework/common/debug/ge_log.h"
#include "graph/utils/node_utils.h"
#include "graph/utils/tensor_utils.h"
#include "graph/utils/type_utils.h"
#include "op/op_factory.h"
@@ -68,6 +69,15 @@ Status GeLocalOpsKernelInfoStore::CalcOpRunningParam(Node &ge_node) {
GELOGE(FAILED, "CalcOpRunningParam failed, as op desc is null");
return FAILED;
}

bool is_shape_unknown = false;
if (NodeUtils::GetNodeUnknownShapeStatus(ge_node, is_shape_unknown) == GRAPH_SUCCESS) {
if (is_shape_unknown) {
GELOGI("op:%s is unknown shape, does not need to calc output size.", ge_node.GetName().c_str());
return SUCCESS;
}
}

const string node_name = ge_node.GetName();
const string node_type = ge_node.GetType();
size_t output_size = op_desc->GetOutputsSize();
@@ -157,6 +167,13 @@ Status GeLocalOpsKernelInfoStore::CalcConstantStrMemSize(const OpDescPtr &op_des
void GeLocalOpsKernelInfoStore::GetAllOpsKernelInfo(map<string, OpInfo> &infos) const { infos = op_info_map_; }

Status GeLocalOpsKernelInfoStore::GenerateTask(const Node &node, RunContext &context, vector<TaskDef> &tasks) {
bool is_shape_unknown = false;
if (NodeUtils::GetNodeUnknownShapeStatus(node, is_shape_unknown) == GRAPH_SUCCESS) {
if (is_shape_unknown) {
GELOGI("op:%s is unknown shape, does not need to generate task", node.GetName().c_str());
return SUCCESS;
}
}
string name = node.GetName();
string type = node.GetType();
GELOGD("Ge local generate task for node:%s(%s) begin, tasks.size()=%zu.", name.c_str(), type.c_str(), tasks.size());


+ 11
- 11
src/ge/ge_runtime/task/hccl_task.cc

@@ -128,17 +128,17 @@ bool HcclTask::Distribute() {
ge_task.type = static_cast<uint16_t>(RT_MODEL_TASK_HCCL);
ge_task.stream = stream_;

ge_task.kernelHcclInfo.hccl_type = task_info_->hccl_type();
ge_task.kernelHcclInfo.inputDataAddr = task_info_->input_data_addr();
ge_task.kernelHcclInfo.outputDataAddr = task_info_->output_data_addr();
ge_task.kernelHcclInfo.workSpaceAddr = task_info_->workspace_addr();
ge_task.kernelHcclInfo.workSpaceMemSize = task_info_->workspace_size();
ge_task.kernelHcclInfo.count = task_info_->count();
ge_task.kernelHcclInfo.dataType = static_cast<int32_t>(task_info_->data_type());
ge_task.kernelHcclInfo.opType = static_cast<int32_t>(task_info_->op_type());
ge_task.kernelHcclInfo.rootId = task_info_->root_id();
ge_task.kernelHcclInfo.hcclStreamList = slave_stream_list_;
ge_task.kernelHcclInfo[0].hccl_type = task_info_->hccl_type();
ge_task.kernelHcclInfo[0].inputDataAddr = task_info_->input_data_addr();
ge_task.kernelHcclInfo[0].outputDataAddr = task_info_->output_data_addr();
ge_task.kernelHcclInfo[0].workSpaceAddr = task_info_->workspace_addr();
ge_task.kernelHcclInfo[0].workSpaceMemSize = task_info_->workspace_size();
ge_task.kernelHcclInfo[0].count = task_info_->count();
ge_task.kernelHcclInfo[0].dataType = static_cast<int32_t>(task_info_->data_type());
ge_task.kernelHcclInfo[0].opType = static_cast<int32_t>(task_info_->op_type());
ge_task.kernelHcclInfo[0].rootId = task_info_->root_id();
ge_task.kernelHcclInfo[0].hcclStreamList = slave_stream_list_;

ge_task.privateDef = private_def;
ge_task.privateDefLen = private_def_len;
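kernelHcclInfo has evidently changed from a single struct to an indexable container (presumably to carry per-op info for fused HCCL tasks); this commit fills only slot 0. A hedged sketch of the access pattern, assuming a vector-like member with at least one element:

    // Assumption: kernelHcclInfo is now indexable; the single-task case writes slot 0 only.
    auto &hccl_info = ge_task.kernelHcclInfo[0];
    hccl_info.hccl_type = task_info_->hccl_type();
    hccl_info.count = task_info_->count();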


+ 171
- 69
src/ge/generator/ge_generator.cc

@@ -27,6 +27,7 @@
#include "graph/opsproto_manager.h"
#include "graph/utils/graph_utils.h"
#include "model/ge_model.h"
#include "init/gelib.h"

using std::map;
using std::string;
@@ -34,9 +35,79 @@ using std::vector;

namespace {
const char *const kAttrOpType = "op_type";
}
const char *const kEngineNameDefault = "default";
const char *const kVectorEngine = "VectorEngine";
const char *const kAIcoreEngine = "AIcoreEngine";
const char *const kFileNameSuffix = "online";

std::map<ge::OpEngineType, std::string> engine_type_map{
{ge::ENGINE_SYS, kEngineNameDefault}, {ge::ENGINE_AICORE, kAIcoreEngine}, {ge::ENGINE_VECTOR, kVectorEngine}};
} // namespace

namespace ge {
static Status CheckEngineTypeSupport(const OpDescPtr &op_desc, OpEngineType engine_type) {
GE_CHECK_NOTNULL_EXEC(op_desc, return PARAM_INVALID);
if (engine_type == ENGINE_SYS) {
GELOGI("CheckEngineType: use default engine.");
return SUCCESS;
}
// get op engine name
string op_engine_name;
auto iter = engine_type_map.find(engine_type);
if (iter != engine_type_map.end()) {
op_engine_name = iter->second;
GELOGI("CheckEngineType: engine type: %d", static_cast<int>(engine_type));
} else {
GELOGE(FAILED, "CheckEngineType: engine type: %d not support", static_cast<int>(engine_type));
return FAILED;
}
// set op engine name and op kernel lib name when the engine supports this op
std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
if ((instance_ptr == nullptr) || (!instance_ptr->InitFlag())) {
GELOGE(GE_CLI_GE_NOT_INITIALIZED, "CheckEngineType failed.");
return FAILED;
}
OpsKernelManager &ops_kernel_manager = instance_ptr->OpsKernelManagerObj();
std::vector<OpInfo> op_infos = ops_kernel_manager.GetOpsKernelInfo(op_desc->GetType());
if (op_infos.empty()) {
GELOGE(FAILED, "CheckEngineType: Can not get op info by op type %s", op_desc->GetType().c_str());
return FAILED;
}
string kernel_name;
for (const auto &it : op_infos) {
if (it.engine == op_engine_name) {
kernel_name = it.opKernelLib;
break;
}
}
if (kernel_name.empty()) {
GELOGE(FAILED, "CheckEngineType:Can not find ops kernel,engine name: %s.", op_engine_name.c_str());
return FAILED;
}
auto &kernel_map = ops_kernel_manager.GetAllOpsKernelInfoStores();
auto kernel_info_store = kernel_map.find(kernel_name);
if (kernel_info_store != kernel_map.end()) {
std::string unsupported_reason;
if (kernel_info_store->second->CheckSupported(op_desc, unsupported_reason)) {
op_desc->SetOpEngineName(op_engine_name);
op_desc->SetOpKernelLibName(kernel_name);
GELOGI("CheckEngineType:Set OpKernelLibName %s and engine name %s into op_desc %s", kernel_name.c_str(),
op_engine_name.c_str(), op_desc->GetName().c_str());
return SUCCESS;
} else {
GELOGE(FAILED, "CheckEngineType: check support failed, Op type %s of ops kernel %s is unsupported, reason:%s",
op_desc->GetType().c_str(), kernel_name.c_str(), unsupported_reason.c_str());
return FAILED;
}
} else {
GELOGE(FAILED,
"CheckEngineType:Can not find any supported ops kernel info store by kernel_name %s,"
"op type is %s, op name is %s",
kernel_name.c_str(), op_desc->GetType().c_str(), op_desc->GetName().c_str());
}
return FAILED;
}

static Status AddInputs(const ComputeGraphPtr &graph, const NodePtr &node, const GeTensorDesc &tensor, int32_t index,
bool attr) {
GE_CHECK_NOTNULL_EXEC(graph, return PARAM_INVALID);
@@ -96,7 +167,7 @@ static Status AddOutputs(const ComputeGraphPtr &graph, const NodePtr &node, cons
}

static void GetOpsProtoPath(string &opsproto_path) {
GELOGI("Start to get ops proto path schedule");
GELOGI("Start to get ops proto path schedule.");
const char *path_env = std::getenv("ASCEND_OPP_PATH");
if (path_env != nullptr) {
string path = path_env;
@@ -105,7 +176,7 @@ static void GetOpsProtoPath(string &opsproto_path) {
GELOGE(FAILED, "File path %s is invalid.", path.c_str());
return;
}
opsproto_path = (path + "/op_proto/built-in/" + ":") + (path + "/op_proto/custom/");
opsproto_path = (path + "/op_proto/custom/" + ":") + (path + "/op_proto/built-in/");
GELOGI("Get opsproto so path from env : %s", path.c_str());
return;
}
@@ -113,15 +184,14 @@ static void GetOpsProtoPath(string &opsproto_path) {
GELOGI("path_base is %s", path_base.c_str());
path_base = path_base.substr(0, path_base.rfind('/'));
path_base = path_base.substr(0, path_base.rfind('/') + 1);
opsproto_path = (path_base + "ops/op_proto/built-in/" + ":") + (path_base + "ops/op_proto/custom/");
opsproto_path = (path_base + "ops/op_proto/custom/" + ":") + (path_base + "ops/op_proto/built-in/");
}

class GeGenerator::Impl {
public:
Status BuildModel(const Graph &graph, const vector<GeTensor> &inputs, GraphId &graph_id,
vector<GeModelPtr> &ge_models);
Status BuildModel(const Graph &graph, const vector<GeTensor> &inputs, GraphId &graph_id, GeRootModelPtr &ge_models);

Status SaveModel(const string &file_name_prefix, vector<GeModelPtr> &models, ModelBufferData &model);
Status SaveModel(const string &file_name_prefix, GeModelPtr &models, ModelBufferData &model);

Status SaveParams(GeModelPtr &ge_model, const string &type, const map<string, GeAttrValue> &attrs,
const vector<GeTensor> &inputs, const vector<GeTensor> &outputs);
@@ -141,7 +211,7 @@ Status GeGenerator::Initialize(const map<string, string> &options) {
}
string opsproto_path;
GetOpsProtoPath(opsproto_path);
GELOGI("opsproto_path is %s", opsproto_path.c_str());
GELOGI("Get opsproto path is %s", opsproto_path.c_str());
OpsProtoManager *manager = OpsProtoManager::Instance();
map<string, string> option_tmp;
option_tmp.emplace(std::pair<string, string>(string("ge.opsProtoLibPath"), opsproto_path));
@@ -149,7 +219,7 @@ Status GeGenerator::Initialize(const map<string, string> &options) {

Status ret = impl_->graph_manager_.Initialize(options);
if (ret != SUCCESS) {
GELOGE(GE_GENERATOR_GRAPH_MANAGER_INIT_FAILED, "Graph manager initialize failed");
GELOGE(GE_GENERATOR_GRAPH_MANAGER_INIT_FAILED, "Graph manager initialize failed.");
return GE_GENERATOR_GRAPH_MANAGER_INIT_FAILED;
}
// get ek file
@@ -179,7 +249,7 @@ Status GeGenerator::Finalize() {
GE_CHECK_NOTNULL_EXEC(impl_, return PARAM_INVALID);
Status ret = impl_->graph_manager_.Finalize();
if (ret != SUCCESS) {
GELOGE(GE_GENERATOR_GRAPH_MANAGER_FINALIZE_FAILED, "Graph manager finalize failed");
GELOGE(GE_GENERATOR_GRAPH_MANAGER_FINALIZE_FAILED, "Graph manager finalize failed.");
return GE_GENERATOR_GRAPH_MANAGER_FINALIZE_FAILED;
}
return SUCCESS;
@@ -187,7 +257,7 @@ Status GeGenerator::Finalize() {

Status GeGenerator::GenerateOfflineModel(const Graph &graph, const string &file_name_prefix,
const vector<GeTensor> &inputs) {
GELOGI("Start to GenerateOfflineModel.");
GELOGI("Start to generate offline model.");
ModelBufferData model;
return GenerateModel(graph, file_name_prefix, inputs, model, true);
}
@@ -208,25 +278,22 @@ Status GeGenerator::GenerateInfershapeGraph(const Graph &graph) {
}
return ret;
}
GELOGI("GenerateInfershapeJson success.");
GELOGI("GenerateInfershapeGraph success.");
return SUCCESS;
}

Status GeGenerator::GenerateModel(const Graph &graph, const string &file_name_prefix, const vector<GeTensor> &inputs,
ModelBufferData &model, bool is_offline) {
GraphId graph_id;
vector<GeModelPtr> ge_models;
GeRootModelPtr ge_root_model = nullptr;
GE_CHECK_NOTNULL_EXEC(impl_, return PARAM_INVALID);

string model_name;
auto compute_graph = GraphUtils::GetComputeGraph(graph);
if (compute_graph == nullptr) {
GELOGW("Get compute graph fail.");
} else {
model_name = compute_graph->GetName();
}
// using output as model_name (ignore ".om")
int start_position = file_name_prefix.find_last_of('/') + 1;
int end_position = file_name_prefix.length() - 3;
const string model_name = file_name_prefix.substr(start_position, end_position - start_position);
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(model_name.empty(), return PARAM_INVALID, "om name is not valid!");
impl_->is_offline_ = is_offline;
Status ret = impl_->BuildModel(graph, inputs, graph_id, ge_models);
Status ret = impl_->BuildModel(graph, inputs, graph_id, ge_root_model);
if (ret != SUCCESS) {
GELOGE(ret, "Build model failed");
if (impl_->graph_manager_.Finalize() != SUCCESS) {
@@ -234,11 +301,14 @@ Status GeGenerator::GenerateModel(const Graph &graph, const string &file_name_pr
}
return ret;
}

if (!model_name.empty() && !ge_models.empty()) {
ge_models[0]->SetName(model_name);
}
ret = impl_->SaveModel(file_name_prefix, ge_models, model);
GE_CHECK_NOTNULL(ge_root_model);
GE_CHECK_NOTNULL(ge_root_model->GetRootGraph());
map<string, GeModelPtr> name_to_ge_model = ge_root_model->GetSubgraphInstanceNameToModel();
GeModelPtr &ge_model = name_to_ge_model[ge_root_model->GetRootGraph()->GetName()];

GE_RETURN_WITH_LOG_IF_FALSE(ge_model != nullptr, "ge_model can not be null");
ge_model->SetName(model_name);
ret = impl_->SaveModel(file_name_prefix, ge_model, model);
if (ret != SUCCESS) {
GELOGE(ret, "Save model failed");
if (impl_->graph_manager_.Finalize() != SUCCESS) {
@@ -250,17 +320,9 @@ Status GeGenerator::GenerateModel(const Graph &graph, const string &file_name_pr
return SUCCESS;
}

/**
* @ingroup ge
* @brief Compiling a single operator into an offline model
* @param [in] OpDescPtr &op_desc: Operator description info that needs to be compiled into an offline model file
* @param [in] vector<GeTensor> &inputs: Operator input data description information.
* @param [in] vector<GeTensor> &outputs: Operator output data description information.
* @param [in] const string &model_file_name: Offline model filename.
* @return SUCCESS handle successfully / others handle failed
*/
Status GeGenerator::BuildSingleOpModel(OpDescPtr &op_desc, const vector<GeTensor> &inputs,
const vector<GeTensor> &outputs, const string &model_file_name) {
Status GeGenerator::BuildSingleOp(OpDescPtr &op_desc, const vector<GeTensor> &inputs, const vector<GeTensor> &outputs,
const string &model_file_name, OpEngineType engine_type, ModelBufferData &model_buff,
bool is_offline) {
GE_CHECK_NOTNULL_EXEC(op_desc, return PARAM_INVALID);
if (!inputs.empty() && (inputs.size() != op_desc->GetInputsSize())) {
GELOGE(PARAM_INVALID, "Tensor size: %zu, Inputs size:%zu", inputs.size(), op_desc->GetInputsSize());
@@ -275,7 +337,16 @@ Status GeGenerator::BuildSingleOpModel(OpDescPtr &op_desc, const vector<GeTensor
OpDescPtr op_desc_tmp = AttrUtils::CloneOpDesc(op_desc);
GE_CHECK_NOTNULL(op_desc_tmp);

// 1. Create ComputeGraph.
// 1. check engine type when compile online
if (model_file_name == kFileNameSuffix) {
Status ret = CheckEngineTypeSupport(op_desc, engine_type);
if (ret != SUCCESS) {
GELOGE(ret, "check engine type failed.");
return ret;
}
}

// 2. Create ComputeGraph.
string name = ge::CurrentTimeInStr() + "_" + model_file_name;
ge::ComputeGraphPtr compute_graph = MakeShared<ComputeGraph>(name);
if (compute_graph == nullptr) {
@@ -283,9 +354,11 @@ Status GeGenerator::BuildSingleOpModel(OpDescPtr &op_desc, const vector<GeTensor
}
GE_CHECK_NOTNULL_EXEC(compute_graph, return INTERNAL_ERROR);

// 2. Add Node to ComputeGraph.
// 3. Add Node to ComputeGraph.
NodePtr op_node = compute_graph->AddNode(op_desc);
GE_CHECK_NOTNULL_EXEC(op_node, return INTERNAL_ERROR);

// 4. Create InputData node.
int32_t arg_index = 0;
if (inputs.empty()) {
for (const auto &input_desc : op_desc->GetAllInputsDescPtr()) {
@@ -301,7 +374,7 @@ Status GeGenerator::BuildSingleOpModel(OpDescPtr &op_desc, const vector<GeTensor
}
}

// 4. Create Output node.
// 5. Create Output node.
if (!outputs.empty()) {
GE_CHK_STATUS_RET_NOLOG(AddOutputs(compute_graph, op_node, outputs));
}
@@ -312,41 +385,69 @@ Status GeGenerator::BuildSingleOpModel(OpDescPtr &op_desc, const vector<GeTensor
GELOGI("ATC parser success in single op schedule.");

GraphId graph_id;
vector<GeModelPtr> ge_models;
GeRootModelPtr ge_root_model = nullptr;
GE_CHECK_NOTNULL_EXEC(impl_, return PARAM_INVALID);
GE_CHK_STATUS_RET_NOLOG(impl_->BuildModel(graph, inputs, graph_id, ge_models));
impl_->is_offline_ = is_offline;
GE_CHK_STATUS_RET_NOLOG(impl_->BuildModel(graph, inputs, graph_id, ge_root_model));
map<string, GeAttrValue> op_attrs = op_desc_tmp->GetAllAttrs();
GE_CHECK_NOTNULL(ge_root_model);
GE_CHECK_NOTNULL(ge_root_model->GetRootGraph());
map<string, GeModelPtr> name_to_ge_model = ge_root_model->GetSubgraphInstanceNameToModel();
GeModelPtr &ge_model = name_to_ge_model[ge_root_model->GetRootGraph()->GetName()];
GELOGD("The opType in op_desc_tmp is: %s", op_desc_tmp->GetType().c_str());
GE_CHK_STATUS_RET_NOLOG(impl_->SaveParams(ge_model, op_desc_tmp->GetType(), op_attrs, inputs, outputs));
GE_CHK_STATUS_RET_NOLOG(impl_->SaveModel(model_file_name, ge_model, model_buff));
return SUCCESS;
}

if (!ge_models.empty()) {
map<string, GeAttrValue> op_attrs = op_desc_tmp->GetAllAttrs();
GELOGI("The opType in op_desc_tmp is: %s", op_desc_tmp->GetType().c_str());
GE_CHK_STATUS_RET_NOLOG(impl_->SaveParams(ge_models[0], op_desc_tmp->GetType(), op_attrs, inputs, outputs));
}
/**
* @ingroup ge
* @brief Compiling a single operator into an offline model
* @param [in] OpDescPtr &op_desc: Operator description info that needs to be compiled into an offline model file
* @param [in] vector<GeTensor> &inputs: Operator input data description information.
* @param [in] vector<GeTensor> &outputs: Operator output data description information.
* @param [in] const string &model_file_name: Offline model filename.
* @return SUCCESS handle successfully / others handle failed
*/
Status GeGenerator::BuildSingleOpModel(OpDescPtr &op_desc, const vector<GeTensor> &inputs,
const vector<GeTensor> &outputs, const string &model_file_name) {
GELOGI("Start to Build Single Op Offline Model.");
ModelBufferData model_buff;
GE_CHK_STATUS_RET_NOLOG(impl_->SaveModel(model_file_name, ge_models, model_buff));
return SUCCESS;
OpEngineType engine_type = ENGINE_SYS;
return BuildSingleOp(op_desc, inputs, outputs, model_file_name, engine_type, model_buff, true);
}

/**
* @ingroup ge
 * @brief Compiling a single operator into an online model buffer
 * @param [in] OpDescPtr &op_desc: Operator description info that needs to be compiled into a model buffer
 * @param [in] vector<GeTensor> &inputs: Operator input data description information.
 * @param [in] vector<GeTensor> &outputs: Operator output data description information.
 * @param [in] engine_type: specific engine.
 * @param [out] ModelBufferData &model_buff: model buffer of the op.
* @return SUCCESS handle successfully / others handle failed
*/
Status GeGenerator::BuildSingleOpModel(OpDescPtr &op_desc, const vector<GeTensor> &inputs,
const vector<GeTensor> &outputs, OpEngineType engine_type,
ModelBufferData &model_buff) {
GELOGI("Start to Build Single Op Online");
return BuildSingleOp(op_desc, inputs, outputs, kFileNameSuffix, engine_type, model_buff, false);
}
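A hedged usage sketch of the two public entry points after this change (generator initialization and op_desc/tensor construction are omitted; names follow the diff):

    ge::GeGenerator generator;
    // ... generator.Initialize(options), op_desc, inputs and outputs prepared beforehand ...
    // Offline: writes an .om file; the engine defaults to ENGINE_SYS internally.
    Status ret = generator.BuildSingleOpModel(op_desc, inputs, outputs, "single_op.om");
    // Online: returns the model in a buffer; engine_type selects the kernel lib
    // (ENGINE_AICORE -> "AIcoreEngine", ENGINE_VECTOR -> "VectorEngine").
    ge::ModelBufferData buff;
    ret = generator.BuildSingleOpModel(op_desc, inputs, outputs, ge::ENGINE_AICORE, buff);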

Status GeGenerator::Impl::SaveParams(GeModelPtr &ge_model, const string &type, const map<string, GeAttrValue> &attrs,
const vector<GeTensor> &inputs, const vector<GeTensor> &outputs) {
GE_CHECK_NOTNULL_EXEC(ge_model, return PARAM_INVALID);
GE_CHK_BOOL_EXEC_NOLOG(graph_manager_.SaveParams(*ge_model, type, attrs, inputs, outputs) == SUCCESS,
graph_manager_.Finalize();
(void)graph_manager_.Finalize();
return FAILED);

return SUCCESS;
}

Status GeGenerator::Impl::SaveModel(const string &file_name_prefix, vector<GeModelPtr> &models,
ModelBufferData &model_buff) {
// to be change to ModelHelper interface
if (models.empty()) {
GELOGE(FAILED, "models are empty.");
return FAILED;
}

Status GeGenerator::Impl::SaveModel(const string &file_name_prefix, GeModelPtr &model, ModelBufferData &model_buff) {
ModelHelper model_helper;
model_helper.SetSaveMode(is_offline_);
Status ret = model_helper.SaveToOmModel(models[0], save_param_, file_name_prefix, model_buff);
Status ret = model_helper.SaveToOmModel(model, save_param_, file_name_prefix, model_buff);
if (ret != SUCCESS) {
GELOGE(ret, "Save to Om model failed");
return ret;
@@ -355,20 +456,21 @@ Status GeGenerator::Impl::SaveModel(const string &file_name_prefix, vector<GeMod
}

Status GeGenerator::Impl::BuildModel(const Graph &graph, const vector<GeTensor> &inputs, GraphId &graph_id,
vector<GeModelPtr> &ge_models) {
GeRootModelPtr &ge_root_model) {
static GraphId id = 0;
const std::map<std::string, std::string> options;
Status ret = graph_manager_.AddGraph(id, graph, options);
if (ret != SUCCESS) {
GELOGE(GE_GENERATOR_GRAPH_MANAGER_ADD_GRAPH_FAILED, "graphManager AddGraph failed, id: %u", id);
graph_manager_.Finalize();
GELOGE(GE_GENERATOR_GRAPH_MANAGER_ADD_GRAPH_FAILED, "GraphManager add graph failed, id: %u", id);
(void)graph_manager_.Finalize();
return GE_GENERATOR_GRAPH_MANAGER_ADD_GRAPH_FAILED;
}

GELOGI("models' inputs.size()=%zu", inputs.size());
ret = graph_manager_.BuildGraph(id, inputs, ge_models);
GELOGI("models inputs.size()=%zu", inputs.size());
graph_manager_.SetOptionsRunGraphFlag(false);
ret = graph_manager_.BuildGraph(id, inputs, ge_root_model);
if (ret != SUCCESS) {
GELOGE(GE_GENERATOR_GRAPH_MANAGER_BUILD_GRAPH_FAILED, "graphManager BuildGraph failed, id: %u", id);
GELOGE(GE_GENERATOR_GRAPH_MANAGER_BUILD_GRAPH_FAILED, "GraphManager build graph failed, id: %u", id);
return GE_GENERATOR_GRAPH_MANAGER_BUILD_GRAPH_FAILED;
}

@@ -383,14 +485,14 @@ Status GeGenerator::Impl::GenerateInfershapeGraph(const Graph &graph, GraphId &g
const std::map<std::string, std::string> options;
Status ret = graph_manager_.AddGraph(id, graph, options);
if (ret != SUCCESS) {
GELOGE(GE_GENERATOR_GRAPH_MANAGER_ADD_GRAPH_FAILED, "graphManager AddGraph failed, id: %u", id);
graph_manager_.Finalize();
GELOGE(GE_GENERATOR_GRAPH_MANAGER_ADD_GRAPH_FAILED, "graphManager add graph failed, id: %u", id);
(void)graph_manager_.Finalize();
return GE_GENERATOR_GRAPH_MANAGER_ADD_GRAPH_FAILED;
}

ret = graph_manager_.GenerateInfershapeGraph(id);
if (ret != SUCCESS) {
GELOGE(GE_GENERATOR_GRAPH_MANAGER_BUILD_GRAPH_FAILED, "graphManager BuildGraph failed, id: %u", id);
GELOGE(GE_GENERATOR_GRAPH_MANAGER_BUILD_GRAPH_FAILED, "GraphManager BuildGraph failed, id: %u", id);
return GE_GENERATOR_GRAPH_MANAGER_BUILD_GRAPH_FAILED;
}



+ 213
- 32
src/ge/graph/build/graph_builder.cc

@@ -53,6 +53,7 @@ Status GraphBuilder::CalcOpParam(const ge::ComputeGraphPtr &graph) {
GELOGE(GE_CLI_GE_NOT_INITIALIZED, "GraphBuilder: GE is not initialized");
return GE_CLI_GE_NOT_INITIALIZED;
}

for (const auto &node_ptr : graph->GetAllNodes()) {
GE_CHECK_NOTNULL(node_ptr->GetOpDesc());
std::string kernel_lib_name = node_ptr->GetOpDesc()->GetOpKernelLibName();
@@ -84,76 +85,229 @@ Status GraphBuilder::CalcOpParam(const ge::ComputeGraphPtr &graph) {
return INTERNAL_ERROR;
}
}

auto parent_node = graph->GetParentNode();
if (parent_node == nullptr) {
GELOGI("Graph[%s] do not have parent node, no need update parent node output size.", graph->GetName().c_str());
return SUCCESS;
}

GE_CHK_STATUS_RET(UpdateParentNodeOutputSize(graph, parent_node));
GELOGI("Success to calculate op running param.");
return SUCCESS;
}

Status GraphBuilder::UpdateParentNodeOutputSize(const ge::ComputeGraphPtr &graph, ge::NodePtr &parent_node_ptr) {
GELOGI("Begin to update parent node[%s] of graph[%s] output size.", parent_node_ptr->GetName().c_str(),
graph->GetName().c_str());
auto parent_op_desc = parent_node_ptr->GetOpDesc();
GE_CHECK_NOTNULL(parent_op_desc);
bool is_unknown_shape = false;
if (!AttrUtils::GetBool(parent_op_desc, ATTR_NAME_IS_UNKNOWN_SHAPE, is_unknown_shape)) {
GELOGE(PARAM_INVALID, "Get op %s unknown shape attr failed.", parent_op_desc->GetName().c_str());
return PARAM_INVALID;
}
if (is_unknown_shape) {
GELOGI("Current graph[%s] is unknown, no need to update parent node[%s] output size.", graph->GetName().c_str(),
parent_node_ptr->GetName().c_str());
return SUCCESS;
}
for (const auto &node_ptr : graph->GetDirectNode()) {
if (node_ptr->GetType() != NETOUTPUT) {
continue;
}
auto op_desc = node_ptr->GetOpDesc();
GE_CHECK_NOTNULL(op_desc);
for (const auto &in_data_anchor : node_ptr->GetAllInDataAnchors()) {
auto index = in_data_anchor->GetIdx();
ge::GeTensorDesc desc_temp = op_desc->GetInputDesc(index);
int64_t size = 0;
GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(desc_temp, size) != SUCCESS, GELOGI("Get size failed!"));
uint32_t parent_index = 0;
if (!AttrUtils::GetInt(desc_temp, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) {
GELOGE(INTERNAL_ERROR, "NetOutput input tensor %d, attr %s not found.", index,
ATTR_NAME_PARENT_NODE_INDEX.c_str());
return INTERNAL_ERROR;
}
ge::GeTensorDesc parent_desc_temp = parent_op_desc->GetOutputDesc(parent_index);
ge::TensorUtils::SetSize(parent_desc_temp, size);
GE_CHK_STATUS_RET(parent_op_desc->UpdateOutputDesc(parent_index, parent_desc_temp));
GELOGI("Update parent node[%s] output index[%u] to size[%ld].", parent_node_ptr->GetName().c_str(), parent_index,
size);
}
}
return SUCCESS;
}

Status GraphBuilder::Build(ComputeGraphPtr &comp_graph, std::vector<SubGraphInfoPtr> &subgraph_ptr_list,
GeModelPtr &ge_model_ptr, uint64_t session_id) {
GeRootModelPtr &ge_root_model_ptr, uint64_t session_id) {
GELOGI("Start to build model.");
if (comp_graph == nullptr) {
GELOGE(GE_GRAPH_PARAM_NULLPTR, "Graph build comp_graph is null.");
return GE_GRAPH_PARAM_NULLPTR;
}
ge_root_model_ptr = MakeShared<ge::GeRootModel>(comp_graph);
if (ge_root_model_ptr == nullptr) {
return MEMALLOC_FAILED;
}
GeModelPtr ge_model_ptr = nullptr;
bool is_dynamic_shape = false;
// To be compatible with the old process, do not verify the return value temporarily.
(void)AttrUtils::GetBool(comp_graph, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, is_dynamic_shape);
if (is_dynamic_shape) {
GE_CHK_STATUS_RET(
BuildForDynamicShapeGraph(comp_graph, subgraph_ptr_list, ge_root_model_ptr, ge_model_ptr, session_id),
"Build for dynamic shape graph failed.");
return SUCCESS;
}

GE_CHK_STATUS_RET(BuildForKnownShapeGraph(comp_graph, subgraph_ptr_list, ge_model_ptr, session_id),
"Build for known shape graph failed.");
ge_root_model_ptr->SetSubgraphInstanceNameToModel(comp_graph->GetName(), ge_model_ptr);
return SUCCESS;
}

Status GraphBuilder::BuildForKnownShapeGraph(ComputeGraphPtr &comp_graph,
std::vector<SubGraphInfoPtr> &subgraph_ptr_list, GeModelPtr &ge_model_ptr,
uint64_t session_id) {
GELOGI("Begin to build known shape graph[%s].", comp_graph->GetName().c_str());
Status ret = SecondPartition(comp_graph, subgraph_ptr_list);
GE_CHK_STATUS_RET(ret, "Graph second partition Failed.");
GE_CHK_STATUS_RET(ret, "Graph[%s] second partition Failed.", comp_graph->GetName().c_str());
auto subgraph_map = graph_partitioner_.GetSubGraphMap();

GE_TIMESTAMP_START(BuildSubgraph);
ge::ModelBuilder builder(comp_graph, subgraph_map, stream_max_parallel_num_, hcom_parallel_, build_mode_);

GELOGI("[Build] invoke the other opskernel to generate task.");

GraphUtils::DumpGEGraph(comp_graph, "BeforePreBuildModel");
GraphUtils::DumpGEGraphToOnnx(*comp_graph, "BeforePreBuildModel");

GE_DUMP(comp_graph, "BeforePreBuildModel");
GE_TIMESTAMP_START(PreBuildModel);
GE_CHK_STATUS_RET(builder.PreBuildModel(), "Builder PreBuildModel() return fail.");
GE_CHK_STATUS_RET(builder.PreBuildModel(), "Graph[%s] builder PreBuildModel() return fail.",
comp_graph->GetName().c_str());
GE_TIMESTAMP_END(PreBuildModel, "GraphBuilder::PreBuildModel");

GraphUtils::DumpGEGraph(comp_graph, "AfterPrebuildmodel");
GraphUtils::DumpGEGraphToOnnx(*comp_graph, "AfterPrebuildmodel");

GE_DUMP(comp_graph, "AfterPreBuildModel");
GE_TIMESTAMP_START(CalcOpParam);
GE_CHK_STATUS_RET(CalcOpParam(comp_graph), "Builder CalcOpParam() return fail.");
GE_CHK_STATUS_RET(CalcOpParam(comp_graph), "Graph[%s] builder CalcOpParam() return fail.",
comp_graph->GetName().c_str());
GE_TIMESTAMP_END(CalcOpParam, "GraphBuilder::CalcOpParam");
GraphUtils::DumpGEGraph(comp_graph, "AfterCalcOpParam");
GraphUtils::DumpGEGraphToOnnx(*comp_graph, "AfterCalcOpParam");
GE_DUMP(comp_graph, "AfterCalcOpParam");

ModelPtr model_ptr = MakeShared<ge::Model>();
if (model_ptr == nullptr) {
return MEMALLOC_FAILED;
}
GE_TIMESTAMP_START(BuildModelForGetTask);
GE_CHK_STATUS_RET(builder.BuildModelForGetTask(*model_ptr), "Builder BuildModelForGetTask() return fail.");
GE_CHK_STATUS_RET(builder.BuildModelForGetTask(*model_ptr), "Graph[%s] builder BuildModelForGetTask() return fail.",
comp_graph->GetName().c_str());
GE_TIMESTAMP_END(BuildModelForGetTask, "GraphBuilder::BuildModelForGetTask");

GraphUtils::DumpGEGraph(comp_graph, "AfterBuildModel");
GraphUtils::DumpGEGraphToOnnx(*comp_graph, "AfterBuildModel");
GE_DUMP(comp_graph, "AfterBuildModel");

GE_TIMESTAMP_START(GetTaskInfo);
ret = GetTaskInfo(builder, model_ptr, comp_graph, subgraph_map, session_id);
GE_TIMESTAMP_END(GetTaskInfo, "GraphBuilder::GetTaskInfo");

GraphUtils::DumpGEGraph(comp_graph, "AfterGetTask");
GraphUtils::DumpGEGraphToOnnx(*comp_graph, "AfterGetTask");
GE_DUMP(comp_graph, "AfterGetTask");
if (ret != SUCCESS) {
GELOGE(ret, "Builder GetTaskInfo() return fail.");
GELOGE(ret, "Graph[%s] builder GetTaskInfo() return fail.", comp_graph->GetName().c_str());
return ret;
}

for (auto graph : comp_graph->GetAllSubgraphs()) {
GraphUtils::DumpGEGraphToOnnx(*graph, "SubgraphGetTask");
ge_model_ptr = MakeShared<ge::GeModel>();
if (ge_model_ptr == nullptr) {
return MEMALLOC_FAILED;
}
GE_CHK_STATUS_RET(builder.SaveDataToModel(*model_ptr, *ge_model_ptr),
"Graph[%s] builder SaveDataToModel() return fail.", comp_graph->GetName().c_str());
GELOGI("Success to build graph[%s] model.", comp_graph->GetName().c_str());
GE_TIMESTAMP_END(BuildSubgraph, "GraphBuilder::Build");
return SUCCESS;
}

Status GraphBuilder::BuildForUnknownShapeGraph(ComputeGraphPtr &comp_graph, GeModelPtr &ge_model_ptr,
uint64_t session_id) {
GELOGI("Begin to build unknown shape graph[%s].", comp_graph->GetName().c_str());
GE_TIMESTAMP_START(CalcOpParam);
GE_CHK_STATUS_RET(CalcOpParam(comp_graph), "Graph[%s] builder CalcOpParam() return fail.",
comp_graph->GetName().c_str());
GE_TIMESTAMP_END(CalcOpParam, "GraphBuilder::CalcOpParam");
GE_DUMP(comp_graph, "AfterCalcOpParam");
Graph2SubGraphInfoList subgraph_map;
ge::ModelBuilder builder(comp_graph, subgraph_map, stream_max_parallel_num_, hcom_parallel_, build_mode_);
ModelPtr model_ptr = MakeShared<ge::Model>();
if (model_ptr == nullptr) {
return MEMALLOC_FAILED;
}
GE_TIMESTAMP_START(BuildModelForGetDynShapeTask);
GE_CHK_STATUS_RET(builder.BuildModelForGetDynShapeTask(*model_ptr),
"Graph[%s] builder BuildModelForGetDynShapeTask() return fail.", comp_graph->GetName().c_str());
GE_TIMESTAMP_END(BuildModelForGetDynShapeTask, "GraphBuilder::BuildModelForGetDynShapeTask");
GE_TIMESTAMP_START(GetTaskInfo);
Status ret = GetTaskInfo(builder, model_ptr, comp_graph, subgraph_map, session_id);
GE_TIMESTAMP_END(GetTaskInfo, "GraphBuilder::GetTaskInfo");

GraphUtils::DumpGEGraph(comp_graph, "AfterGetTask");
GraphUtils::DumpGEGraphToOnnx(*comp_graph, "AfterGetTask");
if (ret != SUCCESS) {
GELOGE(ret, "Graph[%s] builder GetTaskInfo() return fail.", comp_graph->GetName().c_str());
return ret;
}
ge_model_ptr = MakeShared<ge::GeModel>();
if (ge_model_ptr == nullptr) {
return MEMALLOC_FAILED;
}
GE_CHK_STATUS_RET(builder.SaveDataToModel(*model_ptr, *ge_model_ptr), "model builder SaveDataToModel() return fail.");
GELOGI("Success to build model.");
GE_TIMESTAMP_END(BuildSubgraph, "GraphBuilder::Build");
GE_CHK_STATUS_RET(builder.SaveDataToModel(*model_ptr, *ge_model_ptr),
"Graph[%s] builder SaveDataToModel() return fail.", comp_graph->GetName().c_str());
GELOGI("Success to build graph[%s] model.", comp_graph->GetName().c_str());
return SUCCESS;
}

Status GraphBuilder::BuildForDynamicShapeGraph(ComputeGraphPtr &comp_graph,
std::vector<SubGraphInfoPtr> &subgraph_ptr_list,
GeRootModelPtr &ge_root_model_ptr, GeModelPtr &ge_model_ptr,
uint64_t session_id) {
GELOGI("Start to build BuildForDynamicShape for dynamic shape.");
for (const auto &node : comp_graph->GetDirectNode()) {
auto op_desc = node->GetOpDesc();
GE_CHECK_NOTNULL(op_desc);
if (node->GetType() == DATA) {
GE_CHK_STATUS_RET(CalcDynShapeRootGraphDataSize(op_desc), "Calc dynamic shape root graph data[%s] size failed.",
op_desc->GetName().c_str());
}

// ATTR_NAME_IS_UNKNOWN_SHAPE is set at the "graph partition" stage, but after fusion the graph may
// have changed, so it needs to be renewed here. For example, in the scene below:
//   (known)partitioncall(known)                          (known)partitioncall(known)
//                                      After fusion
//                 |                        -->
//   (known)Unique(unknown)--->(unknown)Shape(unknown)    (known)FuncDef(known)
// In a scene like this, the graph should be processed as a known shape graph
bool is_unknown_shape = false;
GE_CHK_STATUS_RET(ge::NodeUtils::GetNodeUnknownShapeStatus(*node, is_unknown_shape),
"Get node[%s] shape status failed!", node->GetName().c_str());
if (!is_unknown_shape) {
GE_CHK_BOOL_EXEC(ge::AttrUtils::SetBool(op_desc, ATTR_NAME_IS_UNKNOWN_SHAPE, is_unknown_shape), return FAILED,
"Renew node [%s] attr[%s] failed!", node->GetName().c_str(), ATTR_NAME_IS_UNKNOWN_SHAPE.c_str());
GELOGD("renew node [%s] attr[%s] success! value is %d", node->GetName().c_str(),
ATTR_NAME_IS_UNKNOWN_SHAPE.c_str(), is_unknown_shape);
}

vector<string> subgraph_names = op_desc->GetSubgraphInstanceNames();
for (auto subgraph_name : subgraph_names) {
ComputeGraphPtr subgraph = comp_graph->GetSubgraph(subgraph_name);
bool is_unknown_shape = false;
if (!AttrUtils::GetBool(op_desc, ATTR_NAME_IS_UNKNOWN_SHAPE, is_unknown_shape)) {
GELOGE(PARAM_INVALID, "Get op %s unknown shape attr failed.", op_desc->GetName().c_str());
return PARAM_INVALID;
}
if (is_unknown_shape) {
// unknown shape build flow
GE_CHK_STATUS_RET(BuildForUnknownShapeGraph(subgraph, ge_model_ptr, session_id),
"Build for unknown shape graph failed.");
} else {
// known shape build flow
GE_CHK_STATUS_RET(BuildForKnownShapeGraph(subgraph, subgraph_ptr_list, ge_model_ptr, session_id),
"Build for known shape graph failed.");
}
ge_root_model_ptr->SetSubgraphInstanceNameToModel(subgraph_name, ge_model_ptr);
}
}
return SUCCESS;
}

@@ -199,10 +353,7 @@ Status GraphBuilder::GetTaskInfo(const ge::ModelBuilder &builder, const ModelPtr
GELOGE(ret, "Optimize streamed subGraph fail.");
return ret;
}

GraphUtils::DumpGEGraph(comp_graph, "AfterOptimizeStreamedSubGraph");
GraphUtils::DumpGEGraphToOnnx(*comp_graph, "AfterOptimizeStreamedSubGraph");

GE_DUMP(comp_graph, "AfterOptimizeStreamedSubGraph");
auto *get_var_mem_base =
reinterpret_cast<uint8_t *>(reinterpret_cast<uintptr_t>(ge::VarManager::Instance(0)->GetVarMemLogicBase()));
uint64_t var_size = (ge::VarManager::Instance(session_id)->GetVarMemSize(RT_MEMORY_HBM) > 0)
@@ -289,6 +440,36 @@ Status GraphBuilder::UpdateDataInputSize(const ge::NodePtr &node_ptr) {
return SUCCESS;
}

Status GraphBuilder::CalcDynShapeRootGraphDataSize(const ge::OpDescPtr &op_desc) {
GELOGI("Begin to calc dynamic shape graph data[%s] size.", op_desc->GetName().c_str());
// data op only has one output anchor
ge::GeTensorDesc output_desc = op_desc->GetOutputDesc(0);
int64_t output_size = 0;
if (ge::TensorUtils::GetSize(output_desc, output_size) != SUCCESS) {
GELOGW("Get size failed!");
}

if (output_size > 0) {
GELOGI("No need to update dynamic shape graph data output size[%ld].", output_size);
return SUCCESS;
} else {
int64_t real_dim_size = 0;
ge::graphStatus graph_status = TensorUtils::GetTensorSizeInBytes(output_desc, real_dim_size);
if (graph_status != GRAPH_SUCCESS) {
GELOGE(FAILED, "Get tensor size in bytes failed.");
return FAILED;
}

ge::TensorUtils::SetSize(output_desc, real_dim_size);
GELOGI("Update dynamic shape graph data output size to [%ld].", real_dim_size);
if (op_desc->UpdateOutputDesc(0, output_desc) != GRAPH_SUCCESS) {
GELOGE(FAILED, "Update dynamic shape graph data output desc size failed.");
return FAILED;
}
}
return SUCCESS;
}
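CalcDynShapeRootGraphDataSize keeps a stored output size when it is already positive and otherwise recomputes it from the tensor description. A sketch of just that rule, with the hypothetical CalcTensorSizeBytes standing in for TensorUtils::GetTensorSizeInBytes:

#include <cstdint>
#include <iostream>

// Hypothetical stand-in for TensorUtils::GetTensorSizeInBytes.
int64_t CalcTensorSizeBytes() { return 1 * 3 * 224 * 224 * 4; }

// Returns true when the stored size was (re)computed.
bool EnsureOutputSize(int64_t &stored_size) {
  if (stored_size > 0) {
    return false;  // a positive stored size is kept as-is
  }
  stored_size = CalcTensorSizeBytes();
  return true;
}

int main() {
  int64_t size = 0;
  if (EnsureOutputSize(size)) {
    std::cout << "updated output size to " << size << " bytes\n";
  }
  return 0;
}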

Status GraphBuilder::SecondPartition(ge::ComputeGraphPtr &comp_graph, vector<ge::SubGraphInfoPtr> &subgraph_ptr_list) {
GELOGI("[SecondPartition] second partition.");
GE_TIMESTAMP_START(GraphPartition2);


+ 12 - 3 src/ge/graph/build/graph_builder.h

@@ -38,6 +38,7 @@
#include "graph/partition/graph_partition.h"
#include "graph/utils/graph_utils.h"
#include "graph/utils/tensor_utils.h"
#include "model/ge_root_model.h"

namespace ge {
class GraphBuilder {
@@ -46,8 +47,8 @@ class GraphBuilder {
GraphBuilder(const GraphBuilder &in) = delete;
GraphBuilder &operator=(const GraphBuilder &in) = delete;
virtual ~GraphBuilder() = default;
Status Build(ComputeGraphPtr &comp_graph, std::vector<SubGraphInfoPtr> &subgraph_ptr_list, GeModelPtr &ge_model_ptr,
uint64_t session_id = INVALID_SESSION_ID);
Status Build(ComputeGraphPtr &comp_graph, std::vector<SubGraphInfoPtr> &subgraph_ptr_list,
GeRootModelPtr &ge_model_ptr, uint64_t session_id = INVALID_SESSION_ID);
void SetOptions(const GraphManagerOptions &options);

private:
@@ -56,8 +57,16 @@ class GraphBuilder {
Graph2SubGraphInfoList &subgraph_map, uint64_t session_id = INVALID_SESSION_ID);
Status SetInputSize(const ge::NodePtr &node_ptr);
Status UpdateDataInputSize(const ge::NodePtr &node_ptr);
Status UpdateParentNodeOutputSize(const ge::ComputeGraphPtr &graph, ge::NodePtr &parent_node_ptr);
Status CalcDynShapeRootGraphDataSize(const ge::OpDescPtr &op_desc);
Status SecondPartition(ge::ComputeGraphPtr &comp_graph, vector<ge::SubGraphInfoPtr> &subgraph_ptr_list);

Status BuildForDynamicShapeGraph(ComputeGraphPtr &comp_graph, std::vector<SubGraphInfoPtr> &subgraph_ptr_list,
GeRootModelPtr &ge_root_model_ptr, GeModelPtr &ge_model_ptr,
uint64_t session_id = INVALID_SESSION_ID);
Status BuildForKnownShapeGraph(ComputeGraphPtr &comp_graph, std::vector<SubGraphInfoPtr> &subgraph_ptr_list,
GeModelPtr &ge_model_ptr, uint64_t session_id = INVALID_SESSION_ID);
Status BuildForUnknownShapeGraph(ComputeGraphPtr &comp_graph, GeModelPtr &ge_model_ptr,
uint64_t session_id = INVALID_SESSION_ID);
int build_mode_;

std::map<std::string, int> stream_max_parallel_num_;


+ 7 - 3 src/ge/graph/build/logical_stream_allocator.cc

@@ -512,13 +512,14 @@ Status AllReduceParallelPass::Run(ComputeGraphPtr graph, const vector<SubgraphPt
}

GELOGI("AllReduceParallelPass is enabled.");
GraphUtils::DumpGEGraph(graph, "BeforeAllReduceParallel");
GE_DUMP(graph, "BeforeAllReduceParallel");

// All successors of HcomAllReduce.
set<NodePtr> all_reduce_succs;

for (const NodePtr &node : graph->GetDirectNode()) {
if (node->GetType() != HCOMALLREDUCE || node->GetInDataNodes().size() <= 1) {
if ((node->GetType() != HCOMALLREDUCE && node->GetType() != HVDCALLBACKALLREDUCE) ||
node->GetInDataNodes().size() <= 1) {
continue;
}

@@ -534,7 +535,10 @@ Status AllReduceParallelPass::Run(ComputeGraphPtr graph, const vector<SubgraphPt
string out_stream_label;
GE_CHECK_NOTNULL(out_node->GetOpDesc());
(void)AttrUtils::GetStr(out_node->GetOpDesc(), ATTR_NAME_STREAM_LABEL, out_stream_label);
if (out_stream_label == reduce_stream_label) {
// Normally, an Allreduce does not have a streamLabel; in the Horovod scenario it does.
bool isSuccessorParallel =
(out_stream_label == reduce_stream_label) || (!reduce_stream_label.empty() && out_stream_label.empty());
if (isSuccessorParallel) {
all_reduce_succs.emplace(out_node);
all_out_data_nodes.emplace(out_node);
}
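The widened predicate above also treats an unlabeled successor of a labeled Allreduce as parallel, covering the Horovod case called out in the comment. The predicate in isolation, with a few checks:

#include <cassert>
#include <string>

bool IsSuccessorParallel(const std::string &reduce_label, const std::string &succ_label) {
  return succ_label == reduce_label || (!reduce_label.empty() && succ_label.empty());
}

int main() {
  assert(IsSuccessorParallel("", ""));        // both unlabeled
  assert(IsSuccessorParallel("hvd", "hvd"));  // same label
  assert(IsSuccessorParallel("hvd", ""));     // Horovod: unlabeled successor
  assert(!IsSuccessorParallel("", "other"));  // labeled successor, unlabeled reduce
  return 0;
}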


+ 211 - 58 src/ge/graph/build/memory/block_mem_assigner.cc

@@ -54,13 +54,42 @@ using std::unordered_map;
using std::unordered_set;
using std::vector;

void MemoryBlock::SetHeadOffset(size_t offset) {
head_offset_ = offset;
size_t child_offset = head_offset_;
for (auto block : child_blocks_) {
if (block != nullptr) {
block->SetHeadOffset(child_offset);
child_offset += block->Size();
}
}
}

void MemoryBlock::SetTailOffset(size_t offset) {
tail_offset_ = offset;
size_t child_offset = head_offset_;
for (auto block : child_blocks_) {
if (block != nullptr) {
child_offset += block->Size();
block->SetTailOffset(child_offset - 1);
}
}
}

void MemoryBlock::Resize() {
size_t child_block_size = 0;
for (auto block : child_blocks_) {
if (block != nullptr) {
block->Resize();
child_block_size += block->Size();
}
}
auto iter = std::max_element(real_size_list_.begin(), real_size_list_.end());
if (iter == real_size_list_.end()) {
GELOGW("real_size_list_ is empty");
return;
} else {
size_t block_size = *iter;
size_t block_size = (child_block_size > *iter) ? child_block_size : *iter;
if ((block_size > 0) && (block_size % MEM_ALIGN_SIZE != 0)) {
block_size = (block_size + MEM_ALIGN_SIZE - 1) / MEM_ALIGN_SIZE * MEM_ALIGN_SIZE;
}
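The rounding at the end of Resize() is the standard align-up idiom. A standalone sketch, with kMemAlignSize as an assumed alignment value (the actual MEM_ALIGN_SIZE constant is defined elsewhere in the assigner):

#include <cassert>
#include <cstddef>

constexpr size_t kMemAlignSize = 512;  // assumed value for illustration

// Round size up to the next multiple of align; works for any positive alignment.
size_t AlignUp(size_t size, size_t align) {
  if (size == 0 || size % align == 0) return size;
  return (size + align - 1) / align * align;
}

int main() {
  assert(AlignUp(1, kMemAlignSize) == 512);
  assert(AlignUp(512, kMemAlignSize) == 512);
  assert(AlignUp(513, kMemAlignSize) == 1024);
  return 0;
}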
@@ -102,6 +131,68 @@ bool MemoryBlock::IsSameLabel(std::string &first_batch_label) {
return all_same_label;
}

bool CanNotLifeReuse(MemoryBlock *block) {
if (block == nullptr || !block->reuse_mem_ || block->deleted_block_ || block->continuous_block_ ||
block->GetLifeEnd() == kMaxLifeTime) {
return true;
}
return false;
}

void MemoryBlock::AddLifeReuseBlock(MemoryBlock *block) {
if (CanNotLifeReuse(this) || CanNotLifeReuse(block)) {
return;
}
MemoryBlock *parent = nullptr;
MemoryBlock *child = nullptr;
// merge small block to large block
if ((block->GetLifeBegin() > GetLifeEnd()) && (block->stream_id_ == stream_id_)) {
if ((child_offset_ + block->block_size_) <= block_size_) {
parent = this;
child = block;
} else if ((block->child_offset_ + block_size_) <= block->block_size_) {
parent = block;
child = this;
}
}
if ((parent != nullptr) && (child != nullptr) && child->child_blocks_.empty()) {
parent->child_blocks_.emplace_back(child);
parent->child_offset_ += child->block_size_;
child->deleted_block_ = true;
GELOGI(
"Add block stream id:%ld [size:%zu, life time[begin:%zu, end:%zu]] to"
" block[size:%zu, life time[begin:%zu, end:%zu]]",
stream_id_, child->block_size_, child->GetLifeBegin(), child->GetLifeEnd(), parent->block_size_,
parent->GetLifeBegin(), parent->GetLifeEnd());
}
}
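AddLifeReuseBlock only merges two blocks whose lifetimes do not overlap and whose sizes nest. A simplified sketch of the gating test, with a flattened Block struct in place of MemoryBlock (field names follow the diff; the struct itself is a simplification):

#include <cstddef>
#include <iostream>

struct Block {
  size_t life_begin;
  size_t life_end;
  size_t block_size;
  size_t child_offset = 0;  // space already granted to children
};

// A candidate child may share a parent's memory only if its life begins after
// the parent's life ends and it fits in the parent's remaining space.
bool CanAdopt(const Block &parent, const Block &child) {
  bool disjoint = child.life_begin > parent.life_end;
  bool fits = parent.child_offset + child.block_size <= parent.block_size;
  return disjoint && fits;
}

int main() {
  Block parent{0, 10, 4096};
  Block child{11, 20, 1024};
  std::cout << std::boolalpha << CanAdopt(parent, child) << "\n";  // true
  return 0;
}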

size_t MemoryBlock::GetLifeBegin() {
size_t life_time = 0;
if (!node_type_index_list_.empty()) {
if (node_type_index_list_.front().node != nullptr) {
auto node_op_desc = node_type_index_list_.front().node->GetOpDesc();
if (node_op_desc != nullptr) {
life_time = node_op_desc->GetId();
}
}
}
return life_time;
}

size_t MemoryBlock::GetLifeEnd() {
if (!node_type_index_list_.empty()) {
return node_type_index_list_.back().life_time_end;
}
return kMaxLifeTime;
}

void MemoryBlock::SetLifeTimeEnd(size_t time) {
if (!node_type_index_list_.empty()) {
node_type_index_list_.back().life_time_end = time;
}
}

void SetLastUsedInputMemAttr(NodePtr &node, int input_index) {
if (node == nullptr) {
return;
@@ -122,6 +213,27 @@ void SetLastUsedInputMemAttr(NodePtr &node, int input_index) {
}
}

Status GetNoAlignSize(const ge::OpDesc &desc, uint32_t index, size_t &size) {
// calculate tensor real size
auto output_op_desc = desc.GetOutputDescPtr(index);
if (output_op_desc == nullptr) {
GELOGI("GetNoAlignSize failed. OpName: %s, OpType: %s, index: %d", desc.GetName().c_str(), desc.GetType().c_str(),
index);
return FAILED;
}
int64_t tensor_size = 0;
GeShape shape = output_op_desc->GetShape();
Format format = output_op_desc->GetFormat();
DataType data_type = output_op_desc->GetDataType();
graphStatus graph_status = TensorUtils::CalcTensorMemSize(shape, format, data_type, tensor_size);
if (graph_status != GRAPH_SUCCESS) {
GELOGE(graph_status, "CalcTensorMemSize failed!");
return FAILED;
}
size = static_cast<size_t>(tensor_size);
return SUCCESS;
}
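GetNoAlignSize derives the unpadded tensor size via TensorUtils::CalcTensorMemSize. As a rough illustration of what such a calculation amounts to for a dense layout (the real helper also accounts for format and data type, which this sketch does not):

#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

// bytes = product of dims * element size, assuming a dense layout
int64_t DenseTensorBytes(const std::vector<int64_t> &dims, int64_t elem_bytes) {
  int64_t count = std::accumulate(dims.begin(), dims.end(), int64_t{1},
                                  [](int64_t a, int64_t b) { return a * b; });
  return count * elem_bytes;
}

int main() {
  // e.g. a float32 NCHW tensor of shape 1x3x224x224
  std::cout << DenseTensorBytes({1, 3, 224, 224}, 4) << " bytes\n";
  return 0;
}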

string ToString(ge::NodeTypeIndex &x) {
stringstream ss;
ss << "[" << x.node->GetName() << "(" << x.node->GetType() << "), ";
@@ -150,7 +262,7 @@ string MemoryBlock::String() {
}

BlockMemAssigner::BlockMemAssigner(ge::ComputeGraphPtr compute_graph)
: mem_offset_(0), compute_graph_(std::move(compute_graph)) {}
: mem_offset_(0), compute_graph_(std::move(compute_graph)), life_time_(0) {}

BlockMemAssigner::~BlockMemAssigner() {
for (MemoryBlock *memory_block : memory_blocks_) {
@@ -290,8 +402,9 @@ bool CanReuseBySize(const map<string, uint64_t> &reusable_block_counts, const Me

// continuous memory case: only the maximum real_size can be reused, and one block holds only one continuous memory
if (continuous || reusable_block.continuous_block_) {
auto it = std::max_element(std::begin(reusable_block.RealSizeList()), std::end(reusable_block.RealSizeList()));
if (it != std::end(reusable_block.RealSizeList())) {
auto it =
std::max_element(std::begin(reusable_block.NoAlignSizeList()), std::end(reusable_block.NoAlignSizeList()));
if (it != std::end(reusable_block.NoAlignSizeList())) {
GE_IF_BOOL_EXEC((continuous && reusable_block.continuous_block_) || (continuous && (real_size < *it)) ||
(reusable_block.continuous_block_ && (real_size > *it)),
GELOGD("Conflict current block size:%zu continuous:%d, reuse block max size:%zu continuous:%d",
@@ -498,25 +611,29 @@ void BlockMemAssigner::PrintSymbolMap() {
}
}

MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size, MemoryType mem_type, const NodePtr &n,
uint32_t out_index, const vector<bool> &workspace_reuse_flag,
const bool is_op_reuse_mem, const bool continuous) {
MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size, size_t no_align_size,
MemoryType mem_type, const NodePtr &n, uint32_t out_index,
const vector<bool> &workspace_reuse_flag, const bool is_op_reuse_mem,
const bool continuous) {
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(n == nullptr, return nullptr, "Input parameter n is null.");
auto node_op_desc = n->GetOpDesc();
GE_IF_BOOL_EXEC(node_op_desc == nullptr, return nullptr);

bool is_reuse_memory = false;
string ge_disable_reuse_mem_env = "0";
(void)ge::GetContext().GetOption(kDisableReuseMemory, ge_disable_reuse_mem_env);
if (ge_disable_reuse_mem_env != "1") {
bool reuse_mem_flag = !((workspace_reuse_flag.size() > out_index) && !workspace_reuse_flag[out_index]);
bool is_reuse_memory = !node_op_desc->HasAttr(kL2FusionDynamicConvergeOp) && reuse_mem_flag && is_op_reuse_mem &&
(IsPreReuse(n, out_index));
is_reuse_memory = !node_op_desc->HasAttr(kL2FusionDynamicConvergeOp) && reuse_mem_flag && is_op_reuse_mem &&
(IsPreReuse(n, out_index));
auto stream_id = node_op_desc->GetStreamId();
auto map_iter = reusable_streams_map_.find(stream_id);
if (is_reuse_memory && map_iter != reusable_streams_map_.end()) {
for (auto it = reusable_blocks_.begin(); it != reusable_blocks_.end(); ++it) {
MemoryBlock *reusable_block = *it;
if (!IsPostReuse(reusable_block)) {
reusable_block->reuse_mem_ = false;
GELOGI("Unreusable block.");
continue;
}

@@ -526,7 +643,7 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size,
CanReuseByStream(map_iter->second, *reusable_block)) {
GELOGD("Cross stream mem reuse, target stream:%ld, current stream:%ld", reusable_block->stream_id_,
stream_id);
reusable_block->AddNodeTypeIndex({n, mem_type, out_index}, real_size);
reusable_block->AddNodeTypeIndex({n, mem_type, out_index}, real_size, no_align_size);
if (mem_type == kOutput) {
auto iter = anchor_to_symbol_.find(NodeIndexIO(n, out_index, kOut).ToString());
if (iter != anchor_to_symbol_.end()) {
@@ -543,7 +660,7 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size,
}
}

auto block = new (std::nothrow) MemoryBlock(block_size, is_op_reuse_mem);
auto block = new (std::nothrow) MemoryBlock(block_size, is_reuse_memory);
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(block == nullptr, return nullptr, "new an object failed.");

// Data and netoutput need zero copy block
@@ -551,7 +668,7 @@ MemoryBlock *BlockMemAssigner::ApplyMemory(size_t block_size, size_t real_size,
block->is_zero_copy_ = true;
}

block->Init(real_size, mem_type, n, out_index);
block->Init(real_size, mem_type, n, out_index, no_align_size);
block->stream_id_ = node_op_desc->GetStreamId();
block->ref_count_++;
block->continuous_block_ = continuous;
@@ -577,11 +694,14 @@ MemoryBlock *BlockMemAssigner::ApplyOutMemory(const NodePtr &n, uint32_t index,
if (output_op_desc != nullptr) {
GE_IF_BOOL_EXEC(ge::TensorUtils::GetSize(*output_op_desc, size) != SUCCESS, GELOGI("Get size failed"));
}
size_t no_align_size = 0;
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(GetNoAlignSize(*node_op_desc, index, no_align_size) != SUCCESS, return nullptr,
"Get no align size failed");

if (IsSymbolExist(node_index_io)) {
std::string symbol = anchor_to_symbol_[node_index_io.ToString()];
block = symbol_blocks_[symbol];
block->AddNodeTypeIndex({n, kOutput, index}, size);
block->AddNodeTypeIndex({n, kOutput, index}, size, no_align_size);
block->ref_count_++;
} else {
int64_t max_size = size;
@@ -594,7 +714,8 @@ MemoryBlock *BlockMemAssigner::ApplyOutMemory(const NodePtr &n, uint32_t index,
}
auto block_size = GetBlockSize(max_size, ranges);
vector<bool> workspace_reuse_flag;
block = ApplyMemory(block_size, size, kOutput, n, index, workspace_reuse_flag, is_op_reuse_mem, continuous);
block = ApplyMemory(block_size, size, no_align_size, kOutput, n, index, workspace_reuse_flag, is_op_reuse_mem,
continuous);
}
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(block == nullptr, return nullptr, "Block is nullptr.");
int out_count_reuse_input = block->ref_count_;
@@ -628,7 +749,7 @@ MemoryBlock *BlockMemAssigner::ApplyOutMemory(const NodePtr &n, uint32_t index,
GE_IF_BOOL_EXEC(ge::TensorUtils::GetReuseInputIndex(*owner_node_op_desc, dst_reuse_input_index) != SUCCESS,
GELOGI("Get dst_reuse_input_index failed"));
if (dst_reuse_input && (dst_reuse_input_index == static_cast<uint32_t>(in_anchor->GetIdx()))) {
block->AddNodeTypeIndex({owner_node, kOutput, i}, block->Size());
block->AddNodeTypeIndex({owner_node, kOutput, i}, block->Size(), block->Size());
out_count_reuse_input += 1;
reuse_input = true;
}
@@ -710,6 +831,7 @@ void BlockMemAssigner::ReleaseMemory(MemoryBlock *to_release, vector<MemoryBlock
GE_CHK_TRUE_EXEC_INFO(!to_release->reuse_mem_, return, "doesn't reuse memory");
--to_release->ref_count_;
if (to_release->ref_count_ == 0) {
to_release->SetLifeTimeEnd(life_time_);
reusable_memory.emplace_back(to_release);
AddReusableBlockCount(*to_release, reusable_block_counts_);
}
@@ -852,12 +974,11 @@ Status BlockMemAssigner::AssignOutputMemoryWithReuse(const NodePtr &node, vector
zero_memory_list_.emplace_back(node, kOutput, i);
continue;
}
bool reuse_mem = is_op_reuse_mem_;
// atomic can't be reused
if (is_op_reuse_mem_ && out_node_set_continuous_input && is_atomic) {
reuse_mem = false;
is_op_reuse_mem_ = false;
}
MemoryBlock *mem_block = ApplyOutMemory(node, i, ranges, reuse_mem, out_node_set_continuous_input);
MemoryBlock *mem_block = ApplyOutMemory(node, i, ranges, is_op_reuse_mem_, out_node_set_continuous_input);
if (mem_block != nullptr) {
node_out_blocks_[node->GetName()].emplace_back(mem_block);
if (out_node_set_continuous_input) {
@@ -894,6 +1015,7 @@ void BlockMemAssigner::AssignMemoryWithReuse(vector<int64_t> &ranges) {
for (NodePtr &n : compute_graph_->GetAllNodes()) {
auto node_op_desc = n->GetOpDesc();
GE_IF_BOOL_EXEC(node_op_desc == nullptr, continue);
life_time_ = node_op_desc->GetId();
int64_t stream_id = node_op_desc->GetStreamId();
if (AssignOutputMemoryWithReuse(n, ranges) != SUCCESS) {
return;
@@ -930,9 +1052,9 @@ void BlockMemAssigner::AssignMemoryWithReuse(vector<int64_t> &ranges) {
zero_memory_list_.emplace_back(n, kWorkspace, static_cast<uint32_t>(i));
continue;
}
MemoryBlock *mem_block =
ApplyMemory(GetBlockSize(static_cast<size_t>(temp[i]), ranges), static_cast<size_t>(temp[i]), kWorkspace, n,
static_cast<uint32_t>(i), workspace_reuse_flag, is_op_reuse_mem_, false);
MemoryBlock *mem_block = ApplyMemory(GetBlockSize(static_cast<size_t>(temp[i]), ranges),
static_cast<size_t>(temp[i]), static_cast<size_t>(temp[i]), kWorkspace, n,
static_cast<uint32_t>(i), workspace_reuse_flag, is_op_reuse_mem_, false);
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(mem_block == nullptr, continue, "failed to apply memory block.");
CheckWorkspaceReuse(workspace_reuse_flag, i, stream_id, mem_block);
}
@@ -1001,7 +1123,8 @@ void MergeBlocks(std::vector<MemoryBlock *> &dest, std::vector<MemoryBlock *> &s
dest[i]->AddSymbol(symbol);
}
for (size_t j = 0; j < src[i]->NodeTypeIndexList().size(); ++j) {
dest[i]->AddNodeTypeIndex(src[i]->NodeTypeIndexList()[j], src[i]->RealSizeList()[j]);
dest[i]->AddNodeTypeIndex(src[i]->NodeTypeIndexList()[j], src[i]->RealSizeList()[j],
src[i]->NoAlignSizeList()[j]);
src[i]->deleted_block_ = true;
}
}
@@ -1115,6 +1238,21 @@ void BlockMemAssigner::AssignContinuousBlocks() {
}
}

void BlockMemAssigner::ReuseBlocksByLifeTime() {
for (size_t i = 0; i < memory_blocks_.size(); ++i) {
auto parent = memory_blocks_[i];
if (parent == nullptr || parent->deleted_block_) {
continue;
}
if (parent->reuse_mem_ && !IsPostReuse(parent)) {
parent->reuse_mem_ = false;
}
for (size_t j = i + 1; j < memory_blocks_.size(); ++j) {
parent->AddLifeReuseBlock(memory_blocks_[j]);
}
}
}

///
/// @ingroup domi_omg
/// @brief traverse memory size, resize, calculate offset
@@ -1129,8 +1267,8 @@ void BlockMemAssigner::ResizeMemoryBlocks() {
memory_block->SetHeadOffset(mem_offset_);
mem_offset_ += memory_block->Size();
memory_block->SetTailOffset(mem_offset_ - 1);
GELOGI("mem_offset_ exclude zero_copy_memory is %zu.", mem_offset_);
}
GELOGI("mem_offset_ exclude zero_copy_memory is %zu.", mem_offset_);
}

///
@@ -1142,15 +1280,18 @@ void BlockMemAssigner::ResizeMemoryBlocks() {
/// @param [in] real_size memory size required
/// @return Status result
///
void SetOffsetSize(const NodeTypeIndex &node_type_index, int64_t offset, size_t size, size_t real_size) {
ge::OpDescPtr op_desc = node_type_index.node->GetOpDesc();
void SetOffsetSize(const NodeTypeIndex &node_type, const MemoryBlock *block, size_t real_size, size_t no_align_size,
bool child_block) {
ge::OpDescPtr op_desc = node_type.node->GetOpDesc();
GE_CHK_BOOL_TRUE_EXEC_WITH_LOG(op_desc == nullptr, return, "op_desc is null.");
string graph_name = node_type_index.node->GetOwnerComputeGraph()->GetName();
string graph_name = node_type.node->GetOwnerComputeGraph()->GetName();
vector<int64_t> memorys_type;
int64_t offset = block->HeadOffset();
size_t end = node_type.life_time_end;
bool has_mem_type_attr = ge::AttrUtils::GetListInt(op_desc, ATTR_NAME_OUTPUT_MEM_TYPE_LIST, memorys_type);
if (node_type_index.mem_type == kOutput) {
if (node_type.mem_type == kOutput) {
vector<int64_t> output_list = op_desc->GetOutputOffset();
for (auto i = static_cast<uint32_t>(output_list.size()); i < node_type_index.index + 1; i++) {
for (auto i = static_cast<uint32_t>(output_list.size()); i < node_type.index + 1; i++) {
output_list.emplace_back(kInvalidOffset);
}
if (output_list.empty()) {
@@ -1160,39 +1301,56 @@ void SetOffsetSize(const NodeTypeIndex &node_type_index, int64_t offset, size_t

if ((op_desc->GetType() == DATA) || (op_desc->GetType() == AIPP_DATA_TYPE) || (op_desc->GetType() == MULTISHAPE) ||
(op_desc->GetType() == NETOUTPUT)) {
if ((output_list[node_type_index.index] == kInvalidOffset) || (output_list[node_type_index.index] < offset)) {
output_list.at(node_type_index.index) = offset;
if ((output_list[node_type.index] == kInvalidOffset) || (output_list[node_type.index] < offset)) {
output_list.at(node_type.index) = offset;
}
} else {
// fusion: keep the original other type offset value from op_desc
bool set_out_offset = (!has_mem_type_attr) || (memorys_type[node_type_index.index] != RT_MEMORY_L1);
bool set_out_offset = (!has_mem_type_attr) ||
(memorys_type.size() > node_type.index && memorys_type[node_type.index] != RT_MEMORY_L1);
if (set_out_offset) {
output_list.at(node_type_index.index) = offset;
output_list.at(node_type.index) = offset;
}
}
op_desc->SetOutputOffset(output_list);
GELOGI("[IMAS]Set %s name[%s] output[%d] offset to [%ld] streamid[%ld] size[%zu] realsize[%zu].",
graph_name.c_str(), op_desc->GetName().c_str(), node_type_index.index, offset, op_desc->GetStreamId(), size,
real_size);
} else if (node_type_index.mem_type == kWorkspace) {
} else if (node_type.mem_type == kWorkspace) {
vector<int64_t> workspace_list;
workspace_list = op_desc->GetWorkspace();
for (auto i = static_cast<uint32_t>(workspace_list.size()); i < node_type_index.index + 1; i++) {
for (auto i = static_cast<uint32_t>(workspace_list.size()); i < node_type.index + 1; i++) {
workspace_list.emplace_back(kInvalidOffset);
}
vector<int64_t> workspace_memory_type;
bool has_workspace_mem_type_attr =
ge::AttrUtils::GetListInt(op_desc, TVM_ATTR_NAME_WORKSPACE_TYPE, workspace_memory_type);
vector<int64_t> workspace_mem_type;
bool has_workspace_mem_type = ge::AttrUtils::GetListInt(op_desc, TVM_ATTR_NAME_WORKSPACE_TYPE, workspace_mem_type);
// fusion: keep the original other type offset value from op_desc
bool set_workspace_offset =
(!has_workspace_mem_type_attr) || (workspace_memory_type[node_type_index.index] != RT_MEMORY_L1);
bool set_workspace_offset = (!has_workspace_mem_type) || (workspace_mem_type.size() > node_type.index &&
workspace_mem_type[node_type.index] != RT_MEMORY_L1);
if (set_workspace_offset) {
workspace_list.at(node_type_index.index) = offset;
workspace_list.at(node_type.index) = offset;
}
op_desc->SetWorkspace(workspace_list);
GELOGI("[IMAS]Set %s name[%s] workspace[%u] offset to [%ld] streamid[%ld] size[%zu] realsize[%zu].",
graph_name.c_str(), op_desc->GetName().c_str(), node_type_index.index, offset, op_desc->GetStreamId(), size,
real_size);
}
GELOGI(
"[IMAS]Set %s name[%s] %s[%u] offset to [%ld] streamid[%ld] size[%zu] realsize[%zu]"
" noalignsize[%zu] life time begin[%zu] life time end[%zu] child[%d].",
graph_name.c_str(), op_desc->GetName().c_str(), node_type.GetMemType().c_str(), node_type.index, offset,
op_desc->GetStreamId(), block->Size(), real_size, no_align_size, op_desc->GetId(), end, child_block);
}

void SetBlockOpMemOffset(MemoryBlock *block, bool child_block) {
if (block == nullptr) {
return;
}
size_t index = 0;
size_t real_size = 0;
size_t no_align_size = 0;
auto real_size_list_size = block->RealSizeList().size();
for (const NodeTypeIndex &node_type_index : block->NodeTypeIndexList()) {
if (index < real_size_list_size) {
real_size = block->RealSizeList()[index];
no_align_size = block->NoAlignSizeList()[index];
}
SetOffsetSize(node_type_index, block, real_size, no_align_size, child_block);
index++;
}
}
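SetBlockOpMemOffset walks the node list and the real/no-align size lists in parallel, guarding the index so a shorter size list cannot be overrun. The same zip-with-guard pattern in isolation:

#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

int main() {
  std::vector<std::string> nodes = {"conv", "relu", "add"};
  std::vector<size_t> real_sizes = {1024, 256};  // may be shorter than nodes
  size_t real_size = 0;
  for (size_t i = 0; i < nodes.size(); ++i) {
    if (i < real_sizes.size()) real_size = real_sizes[i];  // else keep the last value
    std::cout << nodes[i] << " -> " << real_size << "\n";
  }
  return 0;
}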

@@ -1206,21 +1364,16 @@ void BlockMemAssigner::SetOpMemOffset(bool is_zero_copy) {
continue;
}

size_t index = 0;
size_t real_size = 0;
auto real_size_list_size = memory_block->RealSizeList().size();
for (const NodeTypeIndex &node_type_index : memory_block->NodeTypeIndexList()) {
if (index < real_size_list_size) {
real_size = memory_block->RealSizeList()[index];
}
SetOffsetSize(node_type_index, memory_block->HeadOffset(), memory_block->Size(), real_size);
index++;
SetBlockOpMemOffset(memory_block, false);
for (MemoryBlock *child_block : memory_block->ChildBlockList()) {
SetBlockOpMemOffset(child_block, true);
}
}

if (!is_zero_copy) {
for (const NodeTypeIndex &node_type_index : zero_memory_list_) {
SetOffsetSize(node_type_index, 0, 0, 0);
MemoryBlock block(0, 0);
SetOffsetSize(node_type_index, &block, 0, 0, false);
}
}
}
@@ -1290,7 +1443,7 @@ void BlockMemAssigner::FindHeadAndTailNodesForStream(map<int64_t, pair<NodePtr,
for (size_t i = 0; i < n->GetOpDesc()->GetOutputsSize(); i++) {
int64_t size = 0;
if (ge::TensorUtils::GetSize(*n->GetOpDesc()->GetOutputDescPtr(static_cast<uint32_t>(i)), size) != SUCCESS) {
GELOGW("Get output size failed");
GELOGW("Get output size failed!");
continue;
}
stream_mem_map[stream_id] += size;
@@ -1375,6 +1528,6 @@ void BlockMemAssigner::FindDependentStreamBetweenGraphs(const NodePtr &pre_node,
bool BlockMemAssigner::CheckIsZeroMemNodeType(const string &node_type) const {
return (node_type == VARIABLE) || (node_type == CONSTANT) || (node_type == MULTISHAPE) ||
(node_type == HCOMBROADCAST) || (node_type == HCOMALLREDUCE) || (node_type == CONSTANTOP) ||
(node_type == ASSIGNADD) || (node_type == ASSIGNSUB) || (node_type == ASSIGN);
(node_type == ASSIGNADD) || (node_type == ASSIGNSUB) || (node_type == ASSIGN) || (node_type == HVDWAIT);
}
} // namespace ge
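CheckIsZeroMemNodeType is a fixed membership test over node types. A sketch of the same test expressed with a static set, shown only as a design alternative to the chained comparisons; the literal strings here are placeholders for the VARIABLE/CONSTANT/... constants, whose actual values live in the common types headers:

#include <cassert>
#include <string>
#include <unordered_set>

bool IsZeroMemNodeType(const std::string &node_type) {
  // Placeholder literals; the real code compares against named constants.
  static const std::unordered_set<std::string> kZeroMemTypes = {
      "Variable", "Constant", "MultiShape", "HcomBroadcast", "HcomAllReduce",
      "ConstantOp", "AssignAdd", "AssignSub", "Assign", "HvdWait"};
  return kZeroMemTypes.count(node_type) != 0;
}

int main() {
  assert(IsZeroMemNodeType("Assign"));
  assert(!IsZeroMemNodeType("Conv2D"));
  return 0;
}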

+ 47 - 7 src/ge/graph/build/memory/block_mem_assigner.h

@@ -31,6 +31,8 @@
#include "graph/utils/graph_utils.h"

namespace ge {
const size_t kMaxLifeTime = 0xffffffff;

enum MemoryType { kOutput, kWorkspace };

struct NodeTypeIndex {
@@ -40,6 +42,15 @@ struct NodeTypeIndex {
ge::NodePtr node = nullptr;
MemoryType mem_type = kOutput;
uint32_t index = 0;
size_t life_time_end = kMaxLifeTime;
const string GetMemType() const {
if (mem_type == kOutput) {
return "output";
} else if (mem_type == kWorkspace) {
return "workspace";
}
return "unknown";
}
};

class MemoryBlock {
@@ -55,7 +66,8 @@ class MemoryBlock {
is_zero_copy_(false),
block_size_(block_size),
head_offset_(0),
tail_offset_(0) {}
tail_offset_(0),
child_offset_(0) {}

MemoryBlock(const MemoryBlock &) = delete;

@@ -66,23 +78,25 @@ class MemoryBlock {
symbol_list_.clear();
}

void Init(size_t real_size, MemoryType type, const ge::NodePtr &node, uint32_t out_index) {
void Init(size_t real_size, MemoryType type, const ge::NodePtr &node, uint32_t out_index, size_t no_align_size) {
real_size_list_.emplace_back(real_size);
no_align_size_list_.emplace_back(no_align_size);
node_type_index_list_.emplace_back(node, type, out_index);
}
size_t Size() const { return block_size_; }

void SetHeadOffset(size_t offset) { head_offset_ = offset; }
void SetHeadOffset(size_t offset);

void SetTailOffset(size_t offset) { tail_offset_ = offset; }
void SetTailOffset(size_t offset);

size_t HeadOffset() const { return head_offset_; }

size_t TailOffset() const { return tail_offset_; }

void AddNodeTypeIndex(const NodeTypeIndex &node_type_index, size_t real_size) {
void AddNodeTypeIndex(const NodeTypeIndex &node_type_index, size_t real_size, size_t no_align_size) {
node_type_index_list_.emplace_back(node_type_index);
real_size_list_.emplace_back(real_size);
no_align_size_list_.emplace_back(no_align_size);
}

void AddSymbol(const std::string &symbol) { symbol_list_.emplace_back(symbol); }
@@ -90,6 +104,8 @@ class MemoryBlock {
const std::vector<NodeTypeIndex> &NodeTypeIndexList() const { return node_type_index_list_; }
const std::vector<std::string> &SymbolList() const { return symbol_list_; }
const std::vector<size_t> &RealSizeList() const { return real_size_list_; }
const std::vector<MemoryBlock *> &ChildBlockList() const { return child_blocks_; }
const std::vector<size_t> &NoAlignSizeList() const { return no_align_size_list_; }

void Resize();

@@ -97,6 +113,14 @@ class MemoryBlock {

bool IsSameLabel(std::string &first_batch_label);

void AddLifeReuseBlock(MemoryBlock *block);

void SetLifeTimeEnd(size_t time);

size_t GetLifeBegin();

size_t GetLifeEnd();

int ref_count_;
int64_t stream_id_;
bool deleted_block_;
@@ -109,10 +133,13 @@ class MemoryBlock {
private:
size_t block_size_;
std::vector<size_t> real_size_list_;
std::vector<size_t> no_align_size_list_;
size_t head_offset_;
size_t tail_offset_;
size_t child_offset_;
std::vector<NodeTypeIndex> node_type_index_list_;
std::vector<std::string> symbol_list_;
std::vector<MemoryBlock *> child_blocks_;
};

class BlockMemAssigner : public MemAssigner {
@@ -292,8 +319,8 @@ class BlockMemAssigner : public MemAssigner {
/// @return MemoryBlock*
/// @author
///
MemoryBlock *ApplyMemory(size_t block_size, size_t real_size, MemoryType mem_type, const ge::NodePtr &n,
uint32_t out_index, const std::vector<bool> &workspace_reuse_flag,
MemoryBlock *ApplyMemory(size_t block_size, size_t real_size, size_t no_align_size, MemoryType mem_type,
const ge::NodePtr &n, uint32_t out_index, const std::vector<bool> &workspace_reuse_flag,
const bool is_op_reuse_mem, const bool continuous);

///
@@ -354,6 +381,17 @@ class BlockMemAssigner : public MemAssigner {
bool IsOutNodeSetContinuousInput(const NodePtr &n, uint32_t out_index, std::string &peer_name,
uint32_t &peer_input_index);

///
/// @ingroup GE
/// @|+++++++++block1++++++++| |+++++++++block1++++++++|
/// @|+++++++++block1++++++++||++block2++| |+++++++++block1++++++++||++block2++|
/// @ |++block2++||++block3++| ==> |++block3++| |++block2++|
/// @ |++block3++| |++block3++|
/// @return void
/// @author
///
void ReuseBlocksByLifeTime();

std::vector<MemoryBlock *> reusable_blocks_;

std::map<std::string, uint64_t> reusable_block_counts_;
@@ -380,6 +418,8 @@ class BlockMemAssigner : public MemAssigner {

bool is_op_reuse_mem_ = true;

size_t life_time_;

int64_t atomic_addr_clean_id_ = 0;
};
} // namespace ge


+ 9 - 7 src/ge/graph/build/memory/graph_mem_assigner.cc

@@ -245,13 +245,14 @@ Status GraphMemoryAssigner::AssignZeroCopyMemory(size_t &mem_offset, size_t &zer
memory_block->SetHeadOffset(mem_offset);
mem_offset += memory_block->Size();
memory_block->SetTailOffset(mem_offset - 1);
GELOGI("mem_offset_ include zero_copy_memory is %zu.", mem_offset);
}
GELOGI("mem_offset_ include zero_copy_memory is %zu.", mem_offset);

// set offset for zero copy nodes
priority_assigner->SetOpMemOffset(true);

zero_mem_copy_size = mem_offset - mem_offset_tmp;
memory_offset_[0].mem_offset_ = mem_offset;

GELOGI("max_mem_offset:%zu, mem_offset:%zu, zero_mem_copy_size:%zu.", mem_offset, mem_offset_tmp, zero_mem_copy_size);

return SUCCESS;
@@ -360,8 +361,11 @@ Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node,
return PARAM_INVALID;);

vector<int64_t> output_list = peer_op_desc->GetOutputOffset();
std::vector<int64_t> offsets_for_fusion = {};
bool has_offset_attr =
AttrUtils::GetListInt(peer_op_desc, ATTR_NAME_OUTPUT_OFFSET_FOR_BUFFER_FUSION, offsets_for_fusion);
if (peer_out_data_anchor->GetIdx() < static_cast<int>(output_list.size())) {
if (continuous_input_alloc) {
if (continuous_input_alloc && !has_offset_attr) {
if (in_data_anchor->GetIdx() == 0) {
continuous_mem_start = output_list.at(peer_out_data_anchor->GetIdx());
}
@@ -391,9 +395,7 @@ Status GraphMemoryAssigner::AssignContinuousInputMemory(const ge::NodePtr &node,
}
peer_op_desc->SetOutputOffset(output_list);
size_t pre_mem_offset = memory_offset_[0].mem_offset_;
std::vector<int64_t> offsets_for_fusion = {};
bool has_offset_attr =
AttrUtils::GetListInt(peer_op_desc, ATTR_NAME_OUTPUT_OFFSET_FOR_BUFFER_FUSION, offsets_for_fusion);

int64_t tensor_desc_size = 0;
if (has_offset_attr) {
if (peer_out_data_anchor->GetIdx() < static_cast<int>(offsets_for_fusion.size())) {
@@ -1232,7 +1234,7 @@ ge::Status GraphMemoryAssigner::UpdateOpInputOffset(const NodePtr &node, vector<
ge::Status GraphMemoryAssigner::UpdateOpInputOffset(const NodePtr &node) const {
GE_CHECK_NOTNULL(node->GetOpDesc());
vector<int64_t> input_list;
if (node->GetType() == HCOMBROADCAST) {
if (node->GetType() == HCOMBROADCAST || node->GetType() == HVDCALLBACKBROADCAST) {
for (const auto &anchor : node->GetAllInDataAnchors()) {
vector<int64_t> output_list;
auto peer_out_anchor = anchor->GetPeerOutAnchor();


+ 1 - 1 src/ge/graph/build/memory/var_mem_assign_util.cc

@@ -208,7 +208,7 @@ Status VarMemAssignUtil::DealVariableNode(uint32_t graph_id, const ge::NodePtr &
for (const ge::OutDataAnchorPtr &var_out_data_anchor : node->GetAllOutDataAnchors()) {
for (const ge::InDataAnchorPtr &dst_in_data_anchor : var_out_data_anchor->GetPeerInDataAnchors()) {
ge::NodePtr dst_node = dst_in_data_anchor->GetOwnerNode();
if (dst_node->GetType() == HCOMBROADCAST) {
if (dst_node->GetType() == HCOMBROADCAST || dst_node->GetType() == HVDCALLBACKBROADCAST) {
GE_CHK_STATUS_RET(DealBroadCastNode(graph_id, dst_node, dst_in_data_anchor, node, session_id));
continue;
}


+ 11 - 1 src/ge/graph/build/model_builder.cc

@@ -412,7 +412,9 @@ Status ModelBuilder::BuildModelDef(ge::Model &model) {
GE_CHK_BOOL_EXEC(ge::AttrUtils::SetInt(&model, ATTR_MODEL_ZERO_COPY_MEMORY_SIZE, zero_copy_mem_size_),
GELOGE(FAILED, "SetInt of ATTR_MODEL_ZERO_COPY_MEMORY_SIZE failed.");
return FAILED);

GE_CHK_BOOL_EXEC(ge::AttrUtils::SetListStr(&model, ATTR_MODEL_OUT_NODES_NAME, domi::GetContext().net_out_nodes),
GELOGE(FAILED, "SetListStr of ATTR_MODEL_OUT_NODES_NAME failed.");
return FAILED);
GELOGI("For model, max_mem_offset_: %zu, zero_copy_mem_size_: %zu", max_mem_offset_, zero_copy_mem_size_);

string ge_core_type;
@@ -651,6 +653,14 @@ Status ModelBuilder::BuildModelForGetTask(ge::Model &model) {
return SUCCESS;
}

Status ModelBuilder::BuildModelForGetDynShapeTask(ge::Model &model_def) {
GE_TIMESTAMP_START(BuildModelDef);
GE_CHK_STATUS_RET(BuildModelDef(model_def), "BuildModelDef failed!");
GE_TIMESTAMP_END(BuildModelDef, "GraphBuilder::BuildModelDef");
SetModelVersion(model_def);
return SUCCESS;
}

ge::Buffer ModelBuilder::GetWeightBuffer() const { return weight_buffer_; }
Status ModelBuilder::CompileSingleOp() {
GELOGD("Begin to compile single op.");


+ 1 - 0 src/ge/graph/build/model_builder.h

@@ -50,6 +50,7 @@ class ModelBuilder {
Status SaveDataToModel(ge::Model &model, ge::GeModel &ge_model);
Status PreBuildModel();
Status BuildModelForGetTask(ge::Model &model_def);
ge::Status BuildModelForGetDynShapeTask(ge::Model &model_def);

ge::Buffer GetWeightBuffer() const;



+ 817 - 731 src/ge/graph/build/stream_allocator.cc (file diff suppressed because it is too large)


+ 30 - 20 src/ge/graph/build/stream_allocator.h

@@ -40,41 +40,47 @@ class StreamAllocator {
const vector<int64_t> &GetHugeStreams() const { return huge_streams_; }

private:
Status SplitStreams(std::vector<std::set<int64_t>> &split_streams);

Status AssignSingleStream();

Status SetActiveStreamsByLabel();
Status UpdateActiveStreams(const std::vector<std::set<int64_t>> &splited_streams);
void UpdateLabelStreams(const std::vector<std::set<int64_t>> &split_streams);
Status SetActiveStreamsForSubgraph();
Status SetActiveStreamsForLoop();
Status CheckStreamActived() const;
Status GetMaxStreamAndTask(bool huge_stream, uint32_t &max_stream_count, uint32_t &max_task_count);
int64_t GetMaxNodeNumPerStream(const NodePtr &node, uint32_t max_node_num_one_stream);
Status SetActiveStreamsForSubgraphs();

Status InsertSyncEvents();
Status InsertOneEventInTwoNodes(const NodePtr &cur_node_ptr, const NodePtr &next_node_ptr);
Status InsertEventsForSubgraph();

Status OptimizeSyncEvents();
Status OptimizeBySendEvents(const std::map<int64_t, std::vector<NodePtr>> &stream_nodes);
Status OptimizeByRecvEvents(const std::map<int64_t, std::vector<NodePtr>> &stream_nodes);
Status OptimizeByStreamActivate();
// Determine if the successor node of RecvNode is directly or indirectly activated by the SendNode precursor node
bool IsRecvNodeActivatedBySendNode(const NodePtr &send_node_ptr, const NodePtr &recv_node_ptr) const;

Status RefreshContinuousEvents();
Status InsertSyncEventNodes();
Status ReorderEventNodes() const;
Status SplitStreams(std::vector<std::set<int64_t>> &split_streams);
bool NeedSpiltNewStream(int64_t stream_node_num, int64_t max_node_num_one_stream, const OpDescPtr &op_desc) const;

Status UpdateActiveStreams(const std::vector<std::set<int64_t>> &splited_streams);
void UpdateLabelStreams(const std::vector<std::set<int64_t>> &split_streams);
Status InsertActiveNodesAfterSwitch(NodePtr &switch_node);
Status InsertActiveNodesAfterSwitch(NodePtr &switch_nodes, std::vector<NodePtr> &switch_active_nodes);
Status SetActiveStreamList(NodePtr &active_node, const std::string &active_label);
Status AddActiveNodes(NodePtr &switch_node, const std::vector<std::string> &ori_active_label_list,
std::vector<std::string> &active_label_list, std::vector<NodePtr> &added_active_nodes);
Status UpdateActiveStreamsForSubgraphs() const;
Status SetActiveStreamsForLoop();
Status CheckStreamActived() const;

Status AddActiveEntryStream();
Status CollectDeactiveStream(const OpDescPtr &op_desc, std::set<uint32_t> &deactive_streams) const;
Status InsertActiveEntryStream(const std::vector<uint32_t> &active_streams, int64_t stream_id);

Status AddEventId(const NodePtr &pre_node, const NodePtr &not_cur, const NodePtr &cur_node, bool not_use_cur);
Status RefreshContinuousEvents();

Status InsertSyncEventNodes();
Status ReorderEventNodes() const;

void DumpEvents();

Status GetMaxStreamAndTask(bool huge_stream, uint32_t &max_stream_count, uint32_t &max_task_count);
int64_t GetMaxNodeNumPerStream(const NodePtr &node, uint32_t max_node_num_one_stream);

void AddSendEventId(const NodePtr &node, uint32_t event_id);
void AddRecvEventId(const NodePtr &node, uint32_t event_id);
void RmvSendEventId(const NodePtr &node, uint32_t event_id);
@@ -83,10 +89,11 @@ class StreamAllocator {
void GetRecvEventIdList(const NodePtr &node, std::vector<uint32_t> &recv_list) const;
NodePtr GetNodeFromSendEventId(uint32_t send_event_id) const;
NodePtr GetNodeFromRecvEventId(uint32_t recv_event_id) const;
Status AddEventId(const NodePtr &pre_node, const NodePtr &not_cur, const NodePtr &cur_node, bool not_use_cur);

void DumpEvents();
// Determine if the successor node of RecvNode is directly or indirectly activated by the SendNode precursor node
bool IsRecvNodeActivatedBySendNode(const NodePtr &send_node_ptr, const NodePtr &recv_node_ptr) const;
Status AddActiveNodes(NodePtr &switch_node, const std::vector<std::string> &ori_active_label_list,
std::vector<std::string> &active_label_list, std::vector<NodePtr> &added_active_nodes);
Status SetActiveStreamList(NodePtr &active_node, const std::string &active_label);

ComputeGraphPtr whole_graph_;
const Graph2SubGraphInfoList &subgraphs_;
@@ -102,6 +109,9 @@ class StreamAllocator {
std::set<int64_t> specific_activated_streams_;
std::map<int64_t, std::set<NodePtr>> specific_activated_streams_nodes_map_;

std::map<NodePtr, int64_t> node_split_stream_map_;
std::map<ComputeGraphPtr, NodePtr> subgraph_first_active_node_map_;

// send events corresponding to the node
std::map<NodePtr, std::vector<uint32_t>> node_to_send_events_;

@@ -109,4 +119,4 @@ class StreamAllocator {
std::map<NodePtr, std::vector<uint32_t>> node_to_recv_events_;
};
} // namespace ge
#endif // GE_GRAPH_BUILD_STREAM_ALLOCATOR_H_
#endif // GE_GRAPH_BUILD_STREAM_ALLOCATOR_H_

+ 1 - 1 src/ge/graph/build/stream_graph_optimizer.cc

@@ -30,7 +30,7 @@ namespace ge {
StreamGraphOptimizer::~StreamGraphOptimizer() {}

void StreamGraphOptimizer::RefreshNodeId(const ComputeGraphPtr &comp_graph, Graph2SubGraphInfoList &subgraph_map) {
size_t node_size = comp_graph->GetDirectNodesSize();
size_t node_size = comp_graph->GetAllNodesSize();
GELOGI("Refresh placeholder and end nodeId start from node num: %zu", node_size);
for (const auto &subgraph_pair : subgraph_map) {
for (const auto &subgraph_info : subgraph_pair.second) {


+ 148 - 11 src/ge/graph/build/task_generator.cc

@@ -17,9 +17,9 @@
#include "graph/build/task_generator.h"
#include <string>
#include <utility>
#include "common/profiling/profiling_manager.h"
#include "common/types.h"
#include "common/util.h"
#include "common/profiling/profiling_manager.h"
#include "framework/common/debug/ge_log.h"

#include "graph/debug/ge_attr_define.h"
@@ -73,12 +73,24 @@ Status TaskGenerator::GetTaskInfo(Model &model, ComputeGraphPtr &graph, uint64_t

std::vector<TaskDef> task_def_list;
std::map<uint32_t, string> op_name_map;
GE_DUMP(graph, "GenerateTaskBefore");
bool is_unknown_shape = false;
NodePtr parent_node = graph->GetParentNode();
if (parent_node != nullptr) {
auto op_desc = parent_node->GetOpDesc();
GE_CHECK_NOTNULL(op_desc);
(void)AttrUtils::GetBool(op_desc, ATTR_NAME_IS_UNKNOWN_SHAPE, is_unknown_shape);
}
Status ret = SUCCESS;
if (is_unknown_shape) {
GELOGI("Beign to generate unknown shape task.");
ret = GenerateUnknownShapeTask(run_context, graph, task_def_list, op_name_map);
} else {
GELOGI("Beign to generate known shape task.");
ret = GenerateTask(run_context, graph, task_def_list, op_name_map);
}
GE_DUMP(graph, "GenerateTaskAfter");

GraphUtils::DumpGEGraph(graph, "GenerateTaskBefore");
GraphUtils::DumpGEGraphToOnnx(*graph, "GenerateTaskBefore");
Status ret = GenerateTask(run_context, graph, task_def_list, op_name_map);
GraphUtils::DumpGEGraph(graph, "GenerateTaskAfter");
GraphUtils::DumpGEGraphToOnnx(*graph, "GenerateTaskAfter");
if (ret != SUCCESS) {
GELOGE(ret, "GenerateTask failed. session_id=%lu", session_id);
return ret;
@@ -251,8 +263,9 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra
GE_TIMESTAMP_CALLNUM_START(GenerateTask);
// map store fusion nodes
map<int64_t, std::vector<NodePtr>> fusion_nodes;
const char *buffer_optimize_on = std::getenv("BUFFER_OPTIMIZE_ON");
if (buffer_optimize_on != nullptr) {
string buffer_optimize = "off_optimize";
(void)ge::GetContext().GetOption(BUFFER_OPTIMIZE, buffer_optimize);
if (buffer_optimize != "off_optimize") {
GE_CHK_STATUS_RET(SaveFusionNodes(fusion_nodes, graph));
}
std::unordered_set<Node *> fusion_nodes_seen;
@@ -342,10 +355,125 @@ Status TaskGenerator::GenerateTask(RunContext &run_context, ComputeGraphPtr &gra
task_def_ptr->set_ops_kernel_store_ptr(reinterpret_cast<uintptr_t>(ops_kernel_info_store_ptr));
}

GELOGD("Call %s to generate node[name:%s(%s), id:%ld, stream_id:%ld] task finished, generate %lu task(s).",
GELOGD("Call %s to generate node[name:%s(%s), id:%ld, stream_id:%ld] task finished, generate %zu task(s).",
op_kernel_lib_name.c_str(), name.c_str(), type.c_str(), op_id, stream_id,
task_list_size_after - task_list_size_before);
}
GE_TIMESTAMP_CALLNUM_END(GenerateTask, "GraphBuild::GenerateTask");
return SUCCESS;
}

Status TaskGenerator::GenerateUnknownShapeTask(RunContext &run_context, ComputeGraphPtr &graph,
vector<domi::TaskDef> &task_def_list,
map<uint32_t, string> &op_name_map) {
std::shared_ptr<GELib> ge_lib = GELib::GetInstance();
if ((ge_lib == nullptr) || !ge_lib->InitFlag()) {
GELOGE(GE_CLI_GE_NOT_INITIALIZED, "GenerateTask failed.");
return GE_CLI_GE_NOT_INITIALIZED;
}
GE_CHK_STATUS_RET(MarkNodeAndSetIndex(graph), "MarkNodeAndSetIndex failed.");
ProfilingPoint profiling_point;
vector<uint32_t> all_reduce_nodes;
GE_CHK_STATUS_RET(FindProfilingTaskIndex(graph, profiling_point, all_reduce_nodes));

const OpsKernelManager &ops_kernel_manager = ge_lib->OpsKernelManagerObj();

GE_TIMESTAMP_CALLNUM_START(GenerateTask);
// map store fusion nodes
map<int64_t, std::vector<NodePtr>> fusion_nodes;
string buffer_optimize = "off_optimize";
(void)ge::GetContext().GetOption(BUFFER_OPTIMIZE, buffer_optimize);
if (buffer_optimize != "off_optimize") {
GE_CHK_STATUS_RET(SaveFusionNodes(fusion_nodes, graph));
}
std::unordered_set<Node *> fusion_nodes_seen;
int64_t group_key;
uint32_t node_index = 0;
rtStream_t stream = nullptr;
GE_CHK_RT_RET(rtStreamCreate(&stream, 0));
run_context.stream = stream;
GE_CHK_RT_RET(rtModelBindStream(run_context.model, stream, 0));
for (auto &node : graph->GetAllNodes()) {
OpDescPtr op_desc = node->GetOpDesc();
GE_CHECK_NOTNULL(op_desc);
node_index++;
string name = node->GetName();
string type = node->GetType();
bool attr_notask = false;
bool get_attr_notask_flag = ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOTASK, attr_notask);
GE_IF_BOOL_EXEC(get_attr_notask_flag && attr_notask,
GELOGI("Node[name:%s, type:%s] does not need to generate task.", name.c_str(), type.c_str());
continue);

GE_CHK_STATUS_RET(UpdateOpIsVarAttr(op_desc, graph->GetSessionID()));
string op_kernel_lib_name = op_desc->GetOpKernelLibName();
// For fusion ddb pass, task def must be continuous.
// Part2: Call
auto fusion_task_info =
FusionTaskInfo{run_context, graph, node, op_desc, node_index, ge_lib,
ops_kernel_manager, task_def_list, op_name_map, profiling_point, all_reduce_nodes};
GE_CHK_STATUS_RET(GenerateTaskForFusionNode(fusion_task_info, fusion_nodes, fusion_nodes_seen),
"Call GenerateTaskForFusionNode node:%s(%s) failed", name.c_str(), type.c_str());
// continue directly
if (ge::AttrUtils::GetInt(op_desc, ATTR_NAME_FUSION_GROUP_KEY, group_key)) {
GELOGI("Fusion node[name:%s, type:%s] do not need generate task again.", name.c_str(), type.c_str());
continue;
}
if (op_kernel_lib_name.empty()) {
GELOGI("Node[name:%s, type:%s] does not need to generate task.", name.c_str(), type.c_str());
continue;
}
OpsKernelInfoStorePtr kernel_info_store = ops_kernel_manager.GetOpsKernelInfoStore(op_kernel_lib_name);
if (kernel_info_store == nullptr) {
GELOGE(INTERNAL_ERROR, "No ops kernel store found. node:%s(%s), op_kernel_lib_name=%s.", name.c_str(),
type.c_str(), op_kernel_lib_name.c_str());
return INTERNAL_ERROR;
}
GE_CHK_STATUS_RET(UpdateAnchorStatus(node), "Call UpdateAnchorStatus node:%s(%s) failed", name.c_str(),
type.c_str());
int64_t op_id = op_desc->GetId();
int64_t stream_id = op_desc->GetStreamId();
// Profiling task
size_t task_list_size_before = task_def_list.size();
GE_CHK_STATUS_RET(InsertProfilingTaskBefore(op_desc, profiling_point, all_reduce_nodes, node_index, task_def_list));

GELOGI("Call %s to generate node[name:%s(%s), id:%ld, stream_id:%ld] task.", op_kernel_lib_name.c_str(),
name.c_str(), type.c_str(), op_id, stream_id);
GE_TIMESTAMP_RESTART(GenerateTask);
auto ret = kernel_info_store->GenerateTask(*node, run_context, task_def_list);
GE_TIMESTAMP_ADD(GenerateTask);
if (ret != SUCCESS) {
GELOGE(ret, "Call %s to generate node[name:%s(%s), id:%ld, stream_id:%ld] task failed.",
op_kernel_lib_name.c_str(), name.c_str(), type.c_str(), op_id, stream_id);
return ret;
}
// Profiling task
GE_CHK_STATUS_RET(InsertProfilingTaskAfter(op_desc, profiling_point, all_reduce_nodes, node_index, task_def_list));
size_t task_list_size_after = task_def_list.size();
// If the task count decreased
if (task_list_size_after < task_list_size_before) {
GELOGE(FAILED, "Call %s to generate node[name:%s(%s), id:%ld, stream_id:%ld] task. but task num from %zu to %zu.",
op_kernel_lib_name.c_str(), name.c_str(), type.c_str(), op_id, stream_id, task_list_size_before,
task_list_size_after);
return FAILED;
}

// Reset stream id to ge stream id, as graph load must use ge stream to reassign stream
void *ops_kernel_info_store_ptr = kernel_info_store.get();
for (size_t idx = task_list_size_before; idx < task_list_size_after; ++idx) {
op_name_map[idx] = name;
// Set opsKernelInfoStorePtr and op_index; these two fields are used in DistributeTask and InitTaskInfo
TaskDef *task_def_ptr = &task_def_list[idx];
GE_CHECK_NOTNULL(task_def_ptr);
task_def_ptr->set_ops_kernel_store_ptr(reinterpret_cast<uintptr_t>(ops_kernel_info_store_ptr));
}

GELOGD("Call %s to generate node[name:%s(%s), id:%ld, stream_id:%ld] task finished, generate %zu task(s).",
op_kernel_lib_name.c_str(), name.c_str(), type.c_str(), op_id, stream_id,
task_list_size_after - task_list_size_before);
}
GE_CHK_RT(rtModelUnbindStream(run_context.model, stream));
GE_CHK_RT(rtStreamDestroy(stream));
GE_TIMESTAMP_CALLNUM_END(GenerateTask, "GraphBuild::GenerateTask");
return SUCCESS;
}
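GenerateUnknownShapeTask brackets generation with rtStreamCreate/rtModelBindStream at entry and rtModelUnbindStream/rtStreamDestroy at exit, but the early error returns in between skip the cleanup. A sketch of a scope guard that would make the unbind/destroy unconditional; this is a design note under stated assumptions, not the diff's approach:

#include <functional>
#include <iostream>
#include <utility>

// Runs the stored callable when the scope ends, even on early return.
class ScopeGuard {
 public:
  explicit ScopeGuard(std::function<void()> on_exit) : on_exit_(std::move(on_exit)) {}
  ~ScopeGuard() {
    if (on_exit_) on_exit_();
  }
  ScopeGuard(const ScopeGuard &) = delete;
  ScopeGuard &operator=(const ScopeGuard &) = delete;

 private:
  std::function<void()> on_exit_;
};

int main() {
  std::cout << "create and bind stream\n";
  ScopeGuard cleanup([] { std::cout << "unbind and destroy stream\n"; });
  // ... generate tasks; any early return still triggers cleanup ...
  return 0;
}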
@@ -381,6 +509,11 @@ Status TaskGenerator::GenerateTaskForFusionNode(FusionTaskInfo &fusion_task_info
fusion_node_type.c_str());
continue;
}
bool attr_notask = false;
GE_IF_BOOL_EXEC(ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOTASK, attr_notask) && attr_notask,
GELOGI("Fusion: fusion_node[name:%s, type:%s] does not need to generate task.",
fusion_node_name.c_str(), fusion_node_type.c_str());
continue);

size_t task_list_size_before = task_def_list.size();
OpsKernelInfoStorePtr kernel_info_store = ops_kernel_manager.GetOpsKernelInfoStore(op_kernel_lib_name);
@@ -528,6 +661,10 @@ Status TaskGenerator::MarkFirstAndLastOps(const vector<OpDescPtr> &ops, bool is_
vector<vector<OpDescPtr>> continuous_op_lists(1);
const set<string> label_op_types({LABELSET, LABELGOTO, LABELGOTOEX, LABELSWITCH, LABELSWITCHBYINDEX});
for (auto &op_desc : ops) {
bool attr_notask = false;
if (ge::AttrUtils::GetBool(op_desc, ATTR_NAME_NOTASK, attr_notask) && attr_notask) {
continue;
}
string op_type = op_desc->GetType();
if (!is_single_stream && (!op_desc->GetSubgraphInstanceNames().empty() || label_op_types.count(op_type) != 0)) {
continuous_op_lists.emplace_back(vector<OpDescPtr>());
@@ -629,7 +766,7 @@ Status TaskGenerator::AutoFindBpOpIndex(const ComputeGraphPtr &graph, ProfilingP
continue;
}

if (op_desc->GetType() == HCOMALLREDUCE) {
if (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE) {
bp_node = node;
all_reduce_nodes.emplace_back(current_idx);
GELOGI("Allreduce name %s, idx %u", op_desc->GetName().c_str(), current_idx);
@@ -721,7 +858,7 @@ Status TaskGenerator::FindBpOfEnv(const ComputeGraphPtr &graph, const std::strin
iter_end = current_idx;
GELOGI("Iter end name %s, idx %u", op_desc->GetName().c_str(), iter_end);
}
if (op_desc->GetType() == HCOMALLREDUCE) {
if (op_desc->GetType() == HCOMALLREDUCE || op_desc->GetType() == HVDCALLBACKALLREDUCE) {
all_reduce_nodes.emplace_back(current_idx);
GELOGI("Allreduce name %s, idx %u", op_desc->GetName().c_str(), current_idx);
}


+ 13 - 1 src/ge/graph/build/task_generator.h

@@ -82,7 +82,7 @@ class TaskGenerator {
Status UpdateOpIsVarAttr(const OpDescPtr &op_desc, uint64_t session_id);

///
/// call engine to generate task.
/// call engine to generate known shape task.
/// @param run_context run context
/// @param graph compute graph
/// @param task_def_list task def list generate by engine
@@ -94,6 +94,18 @@ class TaskGenerator {
std::map<uint32_t, string> &op_name_map);

///
/// call engine to generate unknown shape task.
/// @param run_context run context
/// @param graph compute graph
/// @param task_def_list task def list generate by engine
/// @param op_name_map relation of task index and op
/// @return SUCCESS: success
/// Other: failed
///
Status GenerateUnknownShapeTask(RunContext &run_context, ComputeGraphPtr &graph,
std::vector<domi::TaskDef> &task_def_list, std::map<uint32_t, string> &op_name_map);

///
/// AddModelTaskToModel
/// @param model_task_def model task
/// @param model_def model


+ 7 - 7 src/ge/graph/execute/graph_execute.cc

@@ -258,7 +258,7 @@ Status GraphExecutor::SyncExecuteModel(uint32_t model_id, const std::vector<GeTe

// Run graph return
uint32_t result_code = graph_run_listener_->GetResultCode();
if (result_code != SUCCESS) {
if (result_code != SUCCESS && result_code != END_OF_SEQUENCE) {
GELOGE(GE_GRAPH_EXECUTE_FAILED, "[GraphExecutor] execute model failed, ret=%u, modelId=%u.", result_code,
model_id);
return GE_GRAPH_EXECUTE_FAILED;
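The widened check above treats END_OF_SEQUENCE as a benign completion status rather than an execution failure. The predicate in isolation, with assumed numeric values for the two status codes (only their distinctness matters here):

#include <cassert>
#include <cstdint>

constexpr uint32_t kSuccess = 0;        // assumed value for SUCCESS
constexpr uint32_t kEndOfSequence = 2;  // assumed value for END_OF_SEQUENCE

bool IsExecuteFailure(uint32_t result_code) {
  return result_code != kSuccess && result_code != kEndOfSequence;
}

int main() {
  assert(!IsExecuteFailure(kSuccess));
  assert(!IsExecuteFailure(kEndOfSequence));
  assert(IsExecuteFailure(1));
  return 0;
}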
@@ -319,7 +319,7 @@ Status GraphExecutor::FreeExecuteMemory() {
return SUCCESS;
}

Status GraphExecutor::ExecuteGraph(GraphId graph_id, const GeModelPtr &ge_model,
Status GraphExecutor::ExecuteGraph(GraphId graph_id, const GeRootModelPtr &ge_root_model,
const std::vector<GeTensor> &input_tensor, std::vector<GeTensor> &output_tensor) {
if (graph_id != last_graph_id_) {
auto ret = FreeExecuteMemory();
@@ -333,8 +333,8 @@ Status GraphExecutor::ExecuteGraph(GraphId graph_id, const GeModelPtr &ge_model,
GELOGE(GE_GRAPH_EXECUTE_NOT_INIT, "[GraphExecutor] AI Core Engine without calling SetCondition!");
return GE_GRAPH_EXECUTE_NOT_INIT;
}
GE_CHECK_NOTNULL_EXEC(ge_model, return FAILED);
Status ret = SyncExecuteModel(ge_model->GetModelId(), input_tensor, output_tensor);
GE_CHECK_NOTNULL_EXEC(ge_root_model, return FAILED);
Status ret = SyncExecuteModel(ge_root_model->GetModelId(), input_tensor, output_tensor);
if (ret != SUCCESS) {
GELOGE(GE_GRAPH_SYNC_MODEL_FAILED, "[GraphExecutor] SyncExecuteModel Error!");
return GE_GRAPH_SYNC_MODEL_FAILED;
@@ -343,7 +343,7 @@ Status GraphExecutor::ExecuteGraph(GraphId graph_id, const GeModelPtr &ge_model,
return SUCCESS;
}

Status GraphExecutor::ExecuteGraphAsync(GraphId graph_id, const GeModelPtr &ge_model,
Status GraphExecutor::ExecuteGraphAsync(GraphId graph_id, const GeRootModelPtr &ge_root_model,
const std::vector<InputTensorInfo> &input_tensor) {
GELOGI("[GraphExecutor] Start to async execute graph, graph_id=%u", graph_id);
if (graph_id != last_graph_id_) {
@@ -353,8 +353,8 @@ Status GraphExecutor::ExecuteGraphAsync(GraphId graph_id, const GeModelPtr &ge_m
}
}
last_graph_id_ = graph_id;
GE_CHECK_NOTNULL_EXEC(ge_model, return FAILED);
Status ret = AsyncExecuteModel(ge_model->GetModelId(), input_tensor);
GE_CHECK_NOTNULL_EXEC(ge_root_model, return FAILED);
Status ret = AsyncExecuteModel(ge_root_model->GetModelId(), input_tensor);
if (ret != SUCCESS) {
GELOGE(GE_GRAPH_SYNC_MODEL_FAILED, "[GraphExecutor] AsyncExecuteModel Error!");
return GE_GRAPH_SYNC_MODEL_FAILED;


+ 3 - 3 src/ge/graph/execute/graph_execute.h

@@ -46,11 +46,11 @@ class GraphExecutor {

virtual ~GraphExecutor();

Status ExecuteGraph(GraphId graph_id, const GeModelPtr &ge_model, const std::vector<GeTensor> &input_tensor,
Status ExecuteGraph(GraphId graph_id, const GeRootModelPtr &ge_root_model, const std::vector<GeTensor> &input_tensor,
std::vector<GeTensor> &output_tensor);

Status ExecuteGraphAsync(GraphId graph_id, const GeModelPtr &ge_model,
const std::vector<InputTensorInfo> &input_tensor);
ge::Status ExecuteGraphAsync(GraphId graph_id, const GeRootModelPtr &ge_root_model,
const std::vector<InputTensorInfo> &input_tensor);

Status SetCondition(std::mutex *mutex, std::condition_variable *cond, std::shared_ptr<GraphModelListener> listener);



+ 2 - 42 src/ge/graph/label/label_maker.cc

@@ -96,42 +96,6 @@ void LabelMaker::SetStreamIdOwner(const ComputeGraphPtr &graph, const OpDescPtr

/**
* @ingroup ge
* @brief Link Node to Graph head.
* @param [in] graph: graph for add node.
* @param [in] lb_node: Node for set link to head.
* @return: SUCCESS / FAILED
*/
Status LabelMaker::AddCtrlLink2Data(const ComputeGraphPtr &graph, const NodePtr &node) {
GE_CHECK_NOTNULL(graph);
GE_CHECK_NOTNULL(node);

std::set<NodePtr> linked_nodes;
for (const NodePtr &n : graph->GetDirectNode()) {
GE_CHECK_NOTNULL(n);
if (n->GetType() != DATA) {
continue;
}

// Link control edge to graph head.
for (const NodePtr &out_node : n->GetOutAllNodes()) {
if (linked_nodes.count(out_node) > 0) {
continue;
}

(void)linked_nodes.insert(out_node);
if (GraphUtils::AddEdge(node->GetOutControlAnchor(), out_node->GetInControlAnchor()) != SUCCESS) {
GELOGE(INTERNAL_ERROR, "Add ctrl edge from %s to %s failed.", node->GetName().c_str(),
out_node->GetName().c_str());
return FAILED;
}
}
}

return SUCCESS;
}

/**
* @ingroup ge
* @brief Add StreamActive node at graph front.
* @param [in] graph: graph for add node.
* @param [in] name: stream active node name.
@@ -154,15 +118,10 @@ NodePtr LabelMaker::AddStreamActive(const ComputeGraphPtr &graph, const std::str
vector<uint32_t> active_streams;
(void)AttrUtils::SetStr(op_desc, ATTR_NAME_SWITCH_BRANCH_NODE_LABEL, op_desc->GetName());
(void)AttrUtils::SetListInt(op_desc, ATTR_NAME_ACTIVE_STREAM_LIST, active_streams);
(void)AttrUtils::SetBool(op_desc, ATTR_NAME_SUBGRAPH_FIRST_ACTIVE, true);
NodePtr stream_active = graph->AddNodeFront(op_desc);
GE_CHECK_NOTNULL_EXEC(stream_active, return nullptr);

// Link control edge to graph head.
if (AddCtrlLink2Data(graph, stream_active) != SUCCESS) {
GELOGE(INTERNAL_ERROR, "Add ctrl edge for graph %s failed.", graph->GetName().c_str());
return nullptr;
}

return stream_active;
}

@@ -230,6 +189,7 @@ NodePtr LabelMaker::AddLabelSetLeave(const ComputeGraphPtr &graph, const std::st

GELOGI("LabelSet: Create node %s.", op_desc->GetName().c_str());
(void)AttrUtils::SetInt(op_desc, ATTR_NAME_LABEL_SWITCH_INDEX, index);
(void)AttrUtils::SetBool(op_desc, ATTR_NAME_SUBGRAPH_END_NODE, true);
NodePtr label_set = graph->AddNode(op_desc);
GE_CHECK_NOTNULL_EXEC(label_set, return nullptr);



+ 0 - 1 src/ge/graph/label/label_maker.h

@@ -60,7 +60,6 @@ class LabelMaker {
ComputeGraphPtr parent_graph_;

private:
Status AddCtrlLink2Data(const ComputeGraphPtr &graph, const NodePtr &node);
void SetStreamIdEnter(const ComputeGraphPtr &graph, const OpDescPtr &op_desc);
void SetStreamIdLeave(const ComputeGraphPtr &graph, const OpDescPtr &op_desc);
void SetStreamIdOwner(const ComputeGraphPtr &graph, const OpDescPtr &op_desc);


+ 15
- 0
src/ge/graph/label/partitioned_call_label_maker.cc View File

@@ -50,6 +50,21 @@ Status PartitionedCallLabelMaker::Run(uint32_t &label_index) {
return FAILED;
}

const std::string stream_active_name = parent_node_->GetName() + "/StreamActive"; // rtStreamActive
NodePtr stream_active = AddStreamActive(sub_graph, stream_active_name);
if (stream_active == nullptr) {
GELOGE(INTERNAL_ERROR, "Subgraph: %s add stream active node failed.", sub_graph->GetName().c_str());
return FAILED;
}

for (auto &node : sub_graph->GetDirectNode()) {
if (node->GetType() == NETOUTPUT) {
auto op_desc = node->GetOpDesc();
GE_CHECK_NOTNULL(op_desc);
(void)AttrUtils::SetBool(op_desc, ATTR_NAME_SUBGRAPH_END_NODE, true);
}
}

return SUCCESS;
}



+ 2
- 0
src/ge/graph/label/partitioned_call_label_maker.h View File

@@ -29,6 +29,8 @@
+---------------+
+---------------+
|     Node      |        +---------------+
+---------------+        | StreamActive  |
|     Node      |        +---------------+
+---------------+        |       f       |
|     Node      |        +---------------+
+---------------+        |       u       |


+ 4
- 4
src/ge/graph/load/graph_loader.cc View File

@@ -53,7 +53,7 @@ Status GraphLoader::UnloadModel(uint32_t model_id) {
return SUCCESS;
}

Status GraphLoader::LoadModelOnline(uint32_t &model_id, const std::shared_ptr<ge::GeModel> &ge_model_ptr,
Status GraphLoader::LoadModelOnline(uint32_t &model_id, const std::shared_ptr<ge::GeRootModel> &ge_root_model_ptr,
const std::shared_ptr<ModelListener> &listener) {
GELOGI("Load model online begin.");
rtError_t rt_ret = rtSetDevice(GetContext().DeviceId());
@@ -62,15 +62,15 @@ Status GraphLoader::LoadModelOnline(uint32_t &model_id, const std::shared_ptr<ge
CsaInteract::GetInstance().WriteErrorCode(rt_ret, ERROR_MODULE_RUNTIME, JOBSUBSTATE_GRAPH_LOAD);
return RT_FAILED;
}
if (ge_model_ptr == nullptr) {
if (ge_root_model_ptr == nullptr) {
GELOGE(GE_GRAPH_PARAM_NULLPTR, "[LoadGraph] GE load graph model_ptr is nullptr.");
return GE_GRAPH_PARAM_NULLPTR;
}
model_id = ge_model_ptr->GetModelId();
model_id = ge_root_model_ptr->GetModelId();

auto model_manager = ModelManager::GetInstance();
GE_CHECK_NOTNULL(model_manager);
Status ret = model_manager->LoadModelOnline(model_id, ge_model_ptr, listener);
Status ret = model_manager->LoadModelOnline(model_id, ge_root_model_ptr, listener);
if (ret != SUCCESS) {
GELOGE(ret, "LoadModel: Load failed. ret = %u", ret);
CsaInteract::GetInstance().WriteErrorCode(ret, ERROR_MODULE_FMK, JOBSUBSTATE_GRAPH_LOAD);


+ 1
- 1
src/ge/graph/load/graph_loader.h View File

@@ -71,7 +71,7 @@ class GraphLoader {

static Status DestroyAicpuSessionForInfer(uint32_t model_id);

static Status LoadModelOnline(uint32_t &model_id, const std::shared_ptr<ge::GeModel> &model,
static Status LoadModelOnline(uint32_t &model_id, const std::shared_ptr<ge::GeRootModel> &ge_root_model,
const std::shared_ptr<ModelListener> &listener);
};
} // namespace ge


+ 26
- 9
src/ge/graph/load/new_model_manager/data_dumper.cc View File

@@ -15,9 +15,12 @@
*/

#include "graph/load/new_model_manager/data_dumper.h"

#include <ctime>
#include <map>
#include <utility>
#include <vector>

#include "common/properties_manager.h"
#include "framework/common/debug/ge_log.h"
#include "framework/common/util.h"
@@ -32,6 +35,7 @@
namespace {
const uint32_t kAicpuLoadFlag = 1;
const uint32_t kAicpuUnloadFlag = 0;
const uint32_t kTimeBufferLen = 80;
const char *const kDumpOutput = "output";
const char *const kDumpInput = "input";
const char *const kDumpAll = "all";
@@ -156,10 +160,8 @@ void DataDumper::SaveDumpTask(uint32_t task_id, uint32_t stream_id, const std::s
return;
}

uintptr_t data_addr = args - sizeof(void *) * op_desc->GetInputOffset().size() +
sizeof(void *) * static_cast<uint32_t>(inner_input_mapping.input_anchor_index);
GELOGI("Save input dump task %s, id: %u.", data_op->GetName().c_str(), task_id);
op_list_.push_back({task_id, stream_id, data_op, data_addr, false, inner_input_mapping.input_anchor_index,
op_list_.push_back({task_id, stream_id, data_op, args, false, inner_input_mapping.input_anchor_index,
inner_input_mapping.output_anchor_index, input_tensor->GetShape().GetDims()});
}
}
@@ -188,11 +190,24 @@ static void SetOpMappingLoopAddr(uintptr_t step_id, uintptr_t loop_per_iter, uin
}
}

static std::string GetCurrentTime() {
std::time_t now = std::time(nullptr);
std::tm *ptm = std::localtime(&now);
if (ptm == nullptr) {
return "";
}
char buffer[kTimeBufferLen] = {0};
// format: 20171122042550
std::strftime(buffer, kTimeBufferLen, "%Y%m%d%H%M%S", ptm);
return std::string(buffer);
}

Status DataDumper::DumpOutput(const InnerDumpInfo &inner_dump_info, aicpu::dump::Task &task) {
GELOGI("Start dump output");
if (inner_dump_info.is_task) {
// tbe or aicpu op
const auto &output_descs = inner_dump_info.op->GetAllOutputsDesc();
const auto input_size = inner_dump_info.op->GetAllInputsDesc().size();
const std::vector<void *> output_addrs = ModelUtils::GetOutputDataAddrs(runtime_param_, inner_dump_info.op, false);
if (output_descs.size() != output_addrs.size()) {
GELOGE(PARAM_INVALID, "Invalid output desc addrs size %zu, op %s has %zu output desc.", output_addrs.size(),
@@ -217,8 +232,7 @@ Status DataDumper::DumpOutput(const InnerDumpInfo &inner_dump_info, aicpu::dump:
output.set_original_output_index(origin_output_index);
output.set_original_output_format(static_cast<int32_t>(output_descs.at(i).GetOriginFormat()));
output.set_original_output_data_type(static_cast<int32_t>(output_descs.at(i).GetOriginDataType()));
// due to lhisi virtual addr bug, cannot use args now
output.set_address(static_cast<uint64_t>(reinterpret_cast<uintptr_t>(output_addrs[i])));
output.set_address(static_cast<uint64_t>(inner_dump_info.args + (i + input_size) * sizeof(void *)));

task.mutable_output()->Add(std::move(output));
}
@@ -255,8 +269,8 @@ Status DataDumper::DumpOutput(const InnerDumpInfo &inner_dump_info, aicpu::dump:
GELOGE(FAILED, "Index is out of range.");
return FAILED;
}
output.set_address(
static_cast<uint64_t>(reinterpret_cast<uintptr_t>(output_addrs[inner_dump_info.output_anchor_index])));
auto data_addr = inner_dump_info.args + sizeof(void *) * static_cast<uint32_t>(inner_dump_info.input_anchor_index);
output.set_address(static_cast<uint64_t>(data_addr));

task.mutable_output()->Add(std::move(output));

@@ -282,7 +296,7 @@ Status DataDumper::DumpInput(const InnerDumpInfo &inner_dump_info, aicpu::dump::
input.mutable_shape()->add_dim(dim);
}

input.set_address(static_cast<uint64_t>(reinterpret_cast<uintptr_t>(input_addrs[i])));
input.set_address(static_cast<uint64_t>(inner_dump_info.args + sizeof(void *) * i));
task.mutable_input()->Add(std::move(input));
}
return SUCCESS;
@@ -370,7 +384,10 @@ Status DataDumper::LoadDumpInfo() {
}

aicpu::dump::OpMappingInfo op_mapping_info;
op_mapping_info.set_dump_path(PropertiesManager::Instance().GetDumpOutputPath() + std::to_string(device_id_) + "/");
std::string time_now = GetCurrentTime();
GELOGI("Time is %s now", time_now.c_str());
op_mapping_info.set_dump_path(PropertiesManager::Instance().GetDumpOutputPath() + time_now + "/" +
std::to_string(device_id_) + "/");
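// Illustrative resulting layout (path names assumed): <dump_output_path>/20200617153045/<device_id>/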
op_mapping_info.set_model_name(model_name_);
op_mapping_info.set_model_id(model_id_);
op_mapping_info.set_flag(kAicpuLoadFlag);


+ 206
- 477
src/ge/graph/load/new_model_manager/davinci_model.cc View File

@@ -45,6 +45,7 @@
#include "graph/load/output/output.h"
#include "graph/manager/graph_mem_allocator.h"
#include "graph/manager/graph_var_manager.h"
#include "graph/manager/trans_var_data_utils.h"
#include "graph/manager/util/debug.h"
#include "graph/model_serialize.h"
#include "graph/node.h"
@@ -75,6 +76,7 @@
namespace ge {
namespace {
const uint32_t kDataIndex = 0;
const uint32_t kOutputNum = 1;
const uint32_t kTrueBranchStreamNum = 1;
const uint32_t kThreadNum = 16;
const uint32_t kAddrLen = sizeof(void *);
@@ -83,275 +85,6 @@ const int kBytes = 8;
const uint32_t kDataMemAlignSizeCompare = 64;
const char *const kDefaultBatchLable = "Batch_default";

class RtContextSwitchGuard {
public:
RtContextSwitchGuard(rtCtxMode_t mode, uint32_t device_id) : last_(nullptr), current_(nullptr) {
auto ret = rtCtxGetCurrent(&last_);
if (ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Failed to get current context from rt, error-code %d", ret);
return;
}

ret = rtCtxCreate(&current_, mode, static_cast<int32_t>(device_id));
if (ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Failed to create new context for device %u, error-code %d", device_id, ret);
return;
}

ret = rtCtxSetCurrent(current_);
if (ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Failed to switch context to normal, context %p, device %u", current_, device_id);
return;
}
GELOGD("Create and switch rt context %p type %d for device %u, backup last %p.", current_, mode, device_id, last_);
}

~RtContextSwitchGuard() {
if (current_ != nullptr) {
auto ret = rtCtxDestroy(current_);
GELOGD("Destory current context %p result %d", current_, ret);
}
if (last_ != nullptr) {
auto ret = rtCtxSetCurrent(last_);
GELOGD("Recovery last context %p result %d.", last_, ret);
}
}

private:
rtContext_t last_;
rtContext_t current_;
};

int64_t CalcVarSizeInBytes(const GeTensorDesc &desc) {
int64_t var_size = GetSizeByDataType(desc.GetDataType());
if (var_size <= 0) {
GELOGE(PARAM_INVALID, "Failed to calc var data size from data type %s",
TypeUtils::DataTypeToSerialString(desc.GetDataType()).c_str());
return -1;
}
auto shape = desc.GetShape();
auto dim_num = shape.GetDimNum();
for (size_t dim_index = 0; dim_index < dim_num; ++dim_index) {
var_size *= shape.GetDim(dim_index);
}
return var_size;
}

Status CopyVarFromDevice(uint64_t session_id, const NodePtr &var, std::unique_ptr<uint8_t[]> &var_data,
const GeTensorDesc &input_desc) {
uint8_t *var_logic = nullptr;
GE_CHECK_NOTNULL(var);
auto ret = VarManager::Instance(session_id)->GetVarAddr(var->GetName(), input_desc, &var_logic);
if (ret != SUCCESS) {
GELOGE(INTERNAL_ERROR,
"Failed to copy var %s from device, can not find it"
" from var manager %u",
var->GetName().c_str(), ret);
return INTERNAL_ERROR;
}

uint8_t *var_addr = VarManager::Instance(session_id)->GetVarMemoryAddr(var_logic, RT_MEMORY_HBM);
if (var_addr == nullptr) {
GELOGE(INTERNAL_ERROR,
"Failed to copy var %s from device, cant not get "
"var addr from logic addr %p",
var->GetName().c_str(), var_logic);
return INTERNAL_ERROR;
}

int64_t var_size_bytes = CalcVarSizeInBytes(input_desc);
if (var_size_bytes <= 0) {
return INTERNAL_ERROR;
}

std::unique_ptr<uint8_t[]> var_host(new (std::nothrow) uint8_t[var_size_bytes]);
if (var_host == nullptr) {
GELOGE(OUT_OF_MEMORY, "Failed to malloc rt-host memory, size %ld", var_size_bytes);
return OUT_OF_MEMORY;
}

ret = rtMemcpy(reinterpret_cast<void *>(var_host.get()), var_size_bytes, reinterpret_cast<void *>(var_addr),
var_size_bytes, RT_MEMCPY_DEVICE_TO_HOST);
if (ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED,
"Failed to copy var memory from device, var %s, size %ld,"
" rt-error-code %u",
var->GetName().c_str(), var_size_bytes, ret);
return RT_FAILED;
}

GELOGD("Copy var %s from device to host, size %ld", var->GetName().c_str(), var_size_bytes);
var_data.swap(var_host);

GELOGI("var_logic:%p, var_addr:%p", var_logic, var_addr);

return SUCCESS;
}

Status CopyVarToDevice(const NodePtr &var, const formats::TransResult &trans_result, void *var_addr) {
GELOGD("Copy var %s from host to device, size %zu", var->GetName().c_str(), trans_result.length);
auto ret = rtMemcpy(var_addr, trans_result.length, reinterpret_cast<void *>(trans_result.data.get()),
trans_result.length, RT_MEMCPY_HOST_TO_DEVICE);
if (ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Failed to copy memory to device, size %zu", trans_result.length);
return RT_FAILED;
}
return SUCCESS;
}

Status TransVarOnHost(uint8_t *var_data, const VarTransRoad &trans_road, formats::TransResult &result) {
formats::TransResult result_last_time{};
bool use_init_data = true;
for (const auto &trans_info : trans_road) {
if (trans_info.node_type == RESHAPE || trans_info.node_type == REFORMAT) {
GELOGD("Skip to trans variable data on the reshape/reformat node");
continue;
}
uint8_t *src_data = nullptr;
if (use_init_data) {
src_data = var_data;
use_init_data = false;
} else {
src_data = result_last_time.data.get();
}

formats::TransResult tmp_result{};
if (trans_info.node_type == TRANSDATA) {
auto src_format = trans_info.input.GetFormat();
auto src_shape = trans_info.input.GetShape().GetDims();
auto dst_format = trans_info.output.GetFormat();
auto dst_shape = trans_info.output.GetShape().GetDims();
auto data_type = trans_info.input.GetDataType();
GELOGD("Trans format from %s to %s, shape %s to %s, data-type %s",
TypeUtils::FormatToSerialString(src_format).c_str(), TypeUtils::FormatToSerialString(dst_format).c_str(),
formats::ShapeToString(src_shape).c_str(), formats::ShapeToString(dst_shape).c_str(),
TypeUtils::DataTypeToSerialString(data_type).c_str());
auto ret = formats::TransFormat({src_data, src_format, dst_format, src_shape, dst_shape, data_type}, tmp_result);
if (ret != SUCCESS) {
GELOGE(INTERNAL_ERROR,
"Failed to trans format from %s to %s, shape %s to %s, "
"data type %s error code %u",
TypeUtils::FormatToSerialString(src_format).c_str(), TypeUtils::FormatToSerialString(dst_format).c_str(),
formats::ShapeToString(src_shape).c_str(), formats::ShapeToString(dst_shape).c_str(),
TypeUtils::DataTypeToSerialString(data_type).c_str(), ret);
return ret;
}
} else if (trans_info.node_type == CAST) {
auto input_shape = trans_info.input.GetShape();
auto src_data_size = input_shape.GetShapeSize() == 0 ? 1 : input_shape.GetShapeSize();
auto src_data_type = trans_info.input.GetDataType();
auto dst_data_type = trans_info.output.GetDataType();
GELOGD("Trans data type from %s to %s, input shape %s, data size %ld",
TypeUtils::DataTypeToSerialString(src_data_type).c_str(),
TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), formats::ShapeToString(input_shape).c_str(),
src_data_size);
auto ret = formats::TransDataType({src_data, static_cast<size_t>(src_data_size), src_data_type, dst_data_type},
tmp_result);
if (ret != SUCCESS) {
GELOGE(INTERNAL_ERROR, "Failed to trans data type from %s to %s, input shape %s, data size %ld, error code %u",
TypeUtils::DataTypeToSerialString(src_data_type).c_str(),
TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), formats::ShapeToString(input_shape).c_str(),
src_data_size, ret);
return ret;
}
} else {
GELOGE(UNSUPPORTED, "Failed to trans var data, trans type %s is not supported",
trans_info.node_type.c_str());
return UNSUPPORTED;
}
result_last_time = tmp_result;
}

result = result_last_time;
return SUCCESS;
}

/// re-alloc var memory on device using var-manager
/// free origin var memory(var manager does not support now)
/// @param session_id
/// @param var
/// @param var_size_bytes
/// @param var_device
/// @return
Status ReAssignVarAddr(uint64_t session_id, const std::string &var_name, const GeTensorDesc &tensor_desc,
void **var_device) {
uint8_t *var_logic = nullptr;
Status ret = VarManager::Instance(session_id)->GetVarAddr(var_name, tensor_desc, &var_logic);
if (ret != SUCCESS) {
GELOGE(INTERNAL_ERROR,
"Failed to get var %s device addr, can not find it"
" from var manager %u",
var_name.c_str(), ret);
return INTERNAL_ERROR;
}

uint8_t *var_addr = VarManager::Instance(session_id)->GetVarMemoryAddr(var_logic, RT_MEMORY_HBM);
if (var_addr == nullptr) {
GELOGE(INTERNAL_ERROR, "Failed to convert var %s logic addr to real addr", var_name.c_str());
return INTERNAL_ERROR;
}
*var_device = var_addr;

GELOGI("var_logic:%p, var_addr:%p", var_logic, var_addr);

return SUCCESS;
}

Status TransVarData(const NodePtr &var, const VarTransRoad &trans_road, uint64_t session_id) {
// nothing to do if the trans_road contains only reshape/reformat nodes
GE_CHECK_NOTNULL(var);
bool need_trans = false;
for (auto &road : trans_road) {
if (road.node_type != RESHAPE && road.node_type != REFORMAT) {
need_trans = true;
break;
}
}
if (!need_trans) {
return SUCCESS;
}

// Sync var data from device
std::unique_ptr<uint8_t[]> var_data;
if (trans_road.size() == 0) {
GELOGE(INTERNAL_ERROR, "Failed to get trans_road, trans_road is empty.");
return INTERNAL_ERROR;
}
const GeTensorDesc &input_desc = trans_road.begin()->input;
auto ret = CopyVarFromDevice(session_id, var, var_data, input_desc);
if (ret != SUCCESS) {
return ret;
}

formats::TransResult trans_result{};
ret = TransVarOnHost(var_data.get(), trans_road, trans_result);
if (ret != SUCCESS) {
GELOGE(ret, "Failed to trans var data on host, error code %u", ret);
return ret;
}

void *var_device = nullptr;

/// It is a temporary solution to use the last GeTensorDesc to assign variable memory because the variable manager
/// depends on TensorDesc and it is difficult to be modified. The correct solution is to assign memory based on the
/// size of the converted variable. To complete the final solution, the dependency of the variable manager on
/// TensorDesc needs to be removed. This change is large and needs to be performed step by step.
ret = ReAssignVarAddr(session_id, var->GetName(), trans_road.rbegin()->output, &var_device);
if (ret != SUCCESS) {
GELOGE(ret, "Failed to re-assign memory on device, size %zu", trans_result.length);
return ret;
}

// sync new data to device
ret = CopyVarToDevice(var, trans_result, var_device);
if (ret != SUCCESS) {
GELOGE(ret, "Failed to send var data to device");
return ret;
}

return SUCCESS;
}

inline bool IsDataOp(const std::string &node_type) {
return node_type == DATA_TYPE || node_type == AIPP_DATA_TYPE || node_type == ANN_DATA_TYPE;
}
@@ -474,6 +207,14 @@ DavinciModel::~DavinciModel() {
CleanTbeHandle();

var_mem_base_ = nullptr;
if (known_node_) {
if (args_ != nullptr) {
GE_CHK_RT(rtFree(args_));
}
if (args_host_ != nullptr) {
GE_CHK_RT(rtFreeHost(args_host_));
}
}
} catch (...) {
GELOGW("DavinciModel::~DavinciModel: clear op_list catch exception.");
}
@@ -574,6 +315,14 @@ Status DavinciModel::InitModelMem(void *dev_ptr, size_t mem_size, void *weight_p
GELOGI("copy weights data to device");
}

GE_CHK_STATUS_RET(InitVariableMem(), "init variable mem failed.");
runtime_param_.mem_base = mem_base_;
runtime_param_.weight_base = weights_mem_base_;
return SUCCESS;
}

Status DavinciModel::InitVariableMem() {
// malloc variable memory base
var_mem_base_ = VarManager::Instance(session_id_)->GetVarMemoryBase(RT_MEMORY_HBM);
if (TotalVarMemSize() && var_mem_base_ == nullptr) {
Status ret = VarManager::Instance(session_id_)->MallocVarMemory(TotalVarMemSize());
@@ -582,12 +331,9 @@ Status DavinciModel::InitModelMem(void *dev_ptr, size_t mem_size, void *weight_p
return ret;
}
var_mem_base_ = VarManager::Instance(session_id_)->GetVarMemoryBase(RT_MEMORY_HBM);
GELOGI("[IMAS]InitModelMem graph_%u MallocMemory type[V] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id,
GELOGI("[IMAS]InitVariableMem graph_%u MallocMemory type[V] memaddr[%p] mem_size[%zu]", runtime_param_.graph_id,
var_mem_base_, TotalVarMemSize());
}

runtime_param_.mem_base = mem_base_;
runtime_param_.weight_base = weights_mem_base_;
runtime_param_.var_base = var_mem_base_;
return SUCCESS;
}
@@ -618,11 +364,15 @@ void DavinciModel::InitRuntimeParams() {
ret = ge::AttrUtils::GetInt(ge_model_, ATTR_MODEL_VAR_SIZE, value);
runtime_param_.var_size = ret ? (uint64_t)value : 0;
session_id_ = runtime_param_.session_id;
GELOGI("InitRuntimeParams(), memory_size:%lu, weight_size:%lu, stream_num:%u, session_id:%u, var_size:%lu.",
runtime_param_.mem_size, runtime_param_.weight_size, runtime_param_.stream_num, runtime_param_.session_id,
runtime_param_.var_size);

GELOGI("InitRuntimeParams(), event_num:%u, label_num:%u", runtime_param_.event_num, runtime_param_.label_num);
GELOGI(
"InitRuntimeParams(), memory_size:%lu, weight_size:%lu, session_id:%u, var_size:%lu, logic_var_base:%lu, "
"logic_mem_base:%lu.",
runtime_param_.mem_size, runtime_param_.weight_size, runtime_param_.session_id, runtime_param_.var_size,
runtime_param_.logic_var_base, runtime_param_.logic_mem_base);

GELOGI("InitRuntimeParams(), stream_num:%lu, event_num:%u, label_num:%u", runtime_param_.stream_num,
runtime_param_.event_num, runtime_param_.label_num);
}

void DavinciModel::CheckHasHcomOp() {
@@ -639,7 +389,9 @@ void DavinciModel::CheckHasHcomOp() {
GE_IF_BOOL_EXEC(op_desc == nullptr, GELOGW("Node OpDesc is nullptr"); continue);
GE_IF_BOOL_EXEC(((op_desc->GetType() == HCOMBROADCAST) || (op_desc->GetType() == HCOMALLGATHER) ||
(op_desc->GetType() == HCOMALLREDUCE) || (op_desc->GetType() == HCOMSEND) ||
(op_desc->GetType() == HCOMRECEIVE) || (op_desc->GetType() == HCOMREDUCESCATTER)),
(op_desc->GetType() == HCOMRECEIVE) || (op_desc->GetType() == HCOMREDUCESCATTER) ||
(op_desc->GetType() == HVDCALLBACKALLREDUCE) || (op_desc->GetType() == HVDCALLBACKALLGATHER) ||
(op_desc->GetType() == HVDCALLBACKBROADCAST) || (op_desc->GetType() == HVDWAIT)),
uint32_t stream_id = static_cast<uint32_t>(op_desc->GetStreamId());
(void)hcom_streams_.emplace(stream_id); GELOGD("hcom stream: %u.", stream_id); continue);

@@ -692,6 +444,10 @@ Status DavinciModel::DoTaskSink() {
GELOGI("do task_sink.");
GE_CHK_STATUS_RET(BindModelStream(), "Bind model stream failed.");

if (known_node_) {
GE_CHK_STATUS_RET(MallocKnownArgs(), "Mallloc known node args failed.");
}

GE_CHK_STATUS_RET(InitTaskInfo(*model_task_def_.get()), "InitTaskInfo failed.");

GE_CHK_STATUS_RET(LoadWithQueue(), "LoadWithQueue failed.");
@@ -787,12 +543,14 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size
GE_CHK_STATUS_RET(CopyVarData(compute_graph_), "copy var data failed.");

GE_TIMESTAMP_START(InitModelMem);
GE_CHK_STATUS_RET_NOLOG(InitModelMem(dev_ptr, mem_size, weight_ptr, weight_size));
GELOGI("known_node is %d", known_node_);
if (!known_node_) {
GE_CHK_STATUS_RET_NOLOG(InitModelMem(dev_ptr, mem_size, weight_ptr, weight_size));
data_inputer_ = new (std::nothrow) DataInputer();
GE_CHK_BOOL_RET_STATUS(data_inputer_ != nullptr, INTERNAL_ERROR, "data_inputer_ is nullptr.");
}
GE_TIMESTAMP_END(InitModelMem, "GraphLoader::InitModelMem");

data_inputer_ = new (std::nothrow) DataInputer();
GE_CHK_BOOL_RET_STATUS(data_inputer_ != nullptr, INTERNAL_ERROR, "data_inputer_ is nullptr.");

for (const ge::NodePtr &node : compute_graph_->GetDirectNode()) {
GE_IF_BOOL_EXEC(node->GetOpDesc() == nullptr, continue);
GE_IF_BOOL_EXEC(node->GetOpDesc()->GetType() != VARIABLE, continue);
@@ -817,7 +575,6 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size
}

SetDataDumperArgs();

GE_TIMESTAMP_START(DoTaskSink);
auto ret = DoTaskSink();
GE_TIMESTAMP_END(DoTaskSink, "GraphLoader::DoTaskSink");
@@ -832,6 +589,7 @@ Status DavinciModel::Init(void *dev_ptr, size_t mem_size, void *weight_ptr, size
}
ProfilingManager::Instance().ReportProfilingData(GetTaskDescInfo(), compute_graph_desc_info);
}
GELOGI("davinci model init success.");
return ret;
}

@@ -935,6 +693,10 @@ Status DavinciModel::InitNodes(const ComputeGraphPtr &compute_graph) {
Status DavinciModel::InitDataOp(const NodePtr &node, uint32_t &data_op_index) {
// op_desc Checked by Init: Data, valid.
auto op_desc = node->GetOpDesc();
if (known_node_) {
data_op_list_.push_back(op_desc);
return SUCCESS;
}
uint32_t parent_index = 0; // Ignore subgraph Data Node.
if (AttrUtils::GetInt(op_desc, ATTR_NAME_PARENT_NODE_INDEX, parent_index)) {
GELOGI("Skip subgraph Data node: %s.", op_desc->GetName().c_str());
@@ -1015,6 +777,10 @@ Status DavinciModel::InitInputZeroCopyNodes(const NodePtr &node) {
Status DavinciModel::InitNetOutput(const NodePtr &node) {
// node->GetOpDesc Checked by Init: NetOutput, valid.
auto op_desc = node->GetOpDesc();
if (known_node_) {
output_op_list_.push_back(op_desc);
return SUCCESS;
}
ComputeGraphPtr owner_graph = node->GetOwnerComputeGraph();
GE_CHECK_NOTNULL(owner_graph);
if (owner_graph->GetParentGraph() != nullptr) {
@@ -1024,7 +790,6 @@ Status DavinciModel::InitNetOutput(const NodePtr &node) {
}

output_op_list_.push_back(op_desc);

// Make information for copy output data.
const vector<int64_t> input_size_list = ModelUtils::GetInputSize(op_desc);
const vector<void *> virtual_addr_list = ModelUtils::GetInputDataAddrs(runtime_param_, op_desc, false);
@@ -1048,6 +813,7 @@ Status DavinciModel::InitNetOutput(const NodePtr &node) {
GELOGE(PARAM_INVALID, "Output zero copy nodes init failed!");
return PARAM_INVALID;
}
GELOGI("DavinciModel::InitNetoutput success.");
return SUCCESS;
}

@@ -1605,7 +1371,9 @@ Status DavinciModel::GetOutputDescInfo(vector<InputOutputDescInfo> &output_desc,
for (size_t i = 0; i < output_op_list_.size(); i++) {
auto &op_desc = output_op_list_[i];
uint32_t out_size = static_cast<uint32_t>(op_desc->GetInputsSize());

// get real out nodes from model
vector<std::string> out_node_name;
(void)ge::AttrUtils::GetListStr(ge_model_, ATTR_MODEL_OUT_NODES_NAME, out_node_name);
for (uint32_t index = 0; index < out_size; index++) {
string output_name;
InputOutputDescInfo output;
@@ -1616,10 +1384,14 @@ Status DavinciModel::GetOutputDescInfo(vector<InputOutputDescInfo> &output_desc,
std::vector<int64_t> src_index = op_desc->GetSrcIndex();
GE_CHK_BOOL_RET_STATUS(src_name.size() > index && src_index.size() > index, INTERNAL_ERROR,
"construct output_name failed.");
output_name =
std::string("output_") + std::to_string(index) + "_" + src_name[index] + "_" + std::to_string(src_index[index]);
// forward compatibility: if the old om has no out_node_name, return outputs the original way
if (out_size == out_node_name.size()) {
output_name = out_node_name[index] + ":" + std::to_string(src_index[index]);
} else {
output_name = std::string("output_") + std::to_string(index) + "_" + src_name[index] + "_" +
std::to_string(src_index[index]);
}
output.name = output_name;

output_desc.push_back(output);
formats.push_back(format_result);
}
@@ -1653,8 +1425,8 @@ Status DavinciModel::CopyInputData(const InputData &input_data, bool device_data
"input data size(%u) does not match model required size(%u), ret failed.", data_buf.length,
mem_size);

GELOGI("[IMAS]CopyPlainData memcpy graph_%u type[F] input[%u] memaddr[%p] mem_size[%u] datasize[%u]",
runtime_param_.graph_id, data.first, mem_addr, mem_size, data_buf.length);
GELOGI("[IMAS]CopyPlainData memcpy graph_%u type[F] input[%u] dst[%p] src[%p] mem_size[%u] datasize[%u]",
runtime_param_.graph_id, data.first, mem_addr, data_buf.data, mem_size, data_buf.length);
if (data_buf.length == 0) {
GELOGW("No data need to memcpy!");
return SUCCESS;
@@ -2000,7 +1772,7 @@ Status DavinciModel::CopyOutputData(uint32_t data_id, OutputData &output_data) {
uint32_t output_data_index = 0;
for (auto &op_desc : output_op_list_) {
ret = CopyOutputDataToUser(op_desc, output_data.blobs, output_data_index);
GE_CHK_BOOL_EXEC(ret == SUCCESS, break, "Copy input data to model ret failed, index:%u, model id:%u",
GE_CHK_BOOL_EXEC(ret == SUCCESS, break, "Copy output data to model ret failed, index:%u, model id:%u",
output_data.index, output_data.model_id);
}
}
@@ -2032,8 +1804,10 @@ Status DavinciModel::CopyOutputDataToUser(OpDescPtr &op_desc, std::vector<DataBu
"Model output data size(%u) does not match required size(%u).", v_output_size[i],
data_buf.length);

GELOGI("CopyOutputDataToUser memcpy graph_%u type[F] name[%s] output[%lu] memaddr[%p] mem_size[%u] datasize[%u]",
runtime_param_.graph_id, op_desc->GetName().c_str(), i, data_buf.data, data_buf.length, v_output_size[i]);
GELOGI(
"CopyOutputDataToUser memcpy graph_%u type[F] name[%s] output[%lu] dst[%p] src[%p] mem_size[%u] datasize[%u]",
runtime_param_.graph_id, op_desc->GetName().c_str(), i, data_buf.data, v_output_data_addr[i], data_buf.length,
v_output_size[i]);
GE_CHK_RT_RET(rtMemcpy(data_buf.data, size, v_output_data_addr[i], size, RT_MEMCPY_DEVICE_TO_DEVICE));
}

@@ -2104,14 +1878,9 @@ Status DavinciModel::ReturnResult(uint32_t data_id, const bool rslt_flg, const b
OutputData *output_data) {
GE_CHK_BOOL_EXEC(listener_ != nullptr, return PARAM_INVALID, "listener_ is null.");
std::vector<ge::OutputTensorInfo> outputs;
if (seq_end_flag) {
GELOGW("End of sequence, model id: %u", model_id_);
GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_id, END_OF_SEQUENCE, outputs), "OnComputeDone failed");
return END_OF_SEQUENCE;
}

// return result is not required
if (!rslt_flg) {
if (!rslt_flg && !seq_end_flag) {
GELOGW("Compute failed, model id: %u", model_id_);
GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_id, INTERNAL_ERROR, outputs), "OnComputeDone failed.");
return INTERNAL_ERROR;
@@ -2146,7 +1915,11 @@ Status DavinciModel::ReturnResult(uint32_t data_id, const bool rslt_flg, const b
}

GE_IF_BOOL_EXEC((DumpOpInputOutput() != SUCCESS), GELOGW("dump op failed, model_id: %u", model_id_););

if (seq_end_flag) {
GELOGW("End of sequence, model id: %u", model_id_);
GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_id, END_OF_SEQUENCE, outputs), "OnCompute Done failed.");
return END_OF_SEQUENCE;
}
GE_CHK_STATUS(listener_->OnComputeDone(model_id_, data_id, SUCCESS, outputs), "OnComputeDone failed");
return SUCCESS;
}
@@ -2493,6 +2266,87 @@ void DavinciModel::UnbindTaskSinkStream() {
return;
}

Status DavinciModel::CreateKnownZeroCopyMap(const vector<void *> &inputs, const vector<void *> &outputs) {
GELOGI("DavinciModel::CreateKnownZeroCopyMap in.");
if (inputs.size() != data_op_list_.size()) {
GELOGE(FAILED, "input data addr %u is not equal to input op number %u.", inputs.size(), data_op_list_.size());
return FAILED;
}
for (size_t i = 0; i < data_op_list_.size(); ++i) {
const vector<void *> addr_list = ModelUtils::GetOutputDataAddrs(runtime_param_, data_op_list_[i]);
knonw_input_data_info_[addr_list[kDataIndex]] = inputs[i];
GELOGI("DavinciModel::CreateKnownZeroCopyMap input %d,v addr %p,p addr %p .", i, addr_list[kDataIndex], inputs[i]);
}
if (output_op_list_.size() != kOutputNum) {
GELOGE(FAILED, "output op num is %u, not equal %u.", outputs.size(), kOutputNum);
return FAILED;
}
const vector<void *> addr_list = ModelUtils::GetInputDataAddrs(runtime_param_, output_op_list_[kDataIndex]);
if (outputs.size() != addr_list.size()) {
GELOGE(FAILED, "output data addr %u is not equal to output op number %u.", outputs.size(), addr_list.size());
return FAILED;
}
for (size_t i = 0; i < addr_list.size(); ++i) {
knonw_output_data_info_[addr_list[i]] = outputs[i];
GELOGI("DavinciModel::CreateKnownZeroCopyMap output %d,v addr %p,p addr %p .", i, addr_list[i], outputs[i]);
}
GELOGI("DavinciModel::CreateKnownZeroCopyMap success.");
return SUCCESS;
}
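// Sketch of the map semantics, inferred from the logs above: the key is the
// model-internal ("v", virtual) address and the value is the caller-provided
// ("p", physical) buffer that replaces it at execution time.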

Status DavinciModel::UpdateKnownZeroCopyAddr(vector<void *> &io_addrs, uint32_t args_offset) {
for (size_t i = 0; i < io_addrs.size(); ++i) {
auto it_in = knonw_input_data_info_.find(io_addrs[i]);
if (it_in != knonw_input_data_info_.end()) {
GELOGI("DavinciModel::UpdateKnownZeroCopyAddr input %d,v addr %p,p addr %p .", i, io_addrs[i],
knonw_input_data_info_.at(io_addrs[i]));
io_addrs[i] = knonw_input_data_info_.at(io_addrs[i]);
}
auto it_out = knonw_output_data_info_.find(io_addrs[i]);
if (it_out != knonw_output_data_info_.end()) {
GELOGI("DavinciModel::UpdateKnownZeroCopyAddr output %d,v addr %p,p addr %p .", i, io_addrs[i],
knonw_output_data_info_.at(io_addrs[i]));
io_addrs[i] = knonw_output_data_info_.at(io_addrs[i]);
}
}
// Note: args_size may be equal to src_args_size here.
uint32_t src_args_size = io_addrs.size() * sizeof(uint64_t);
GELOGI("DavinciModel::UpdateKnownZeroCopyAddr args host %p, src_args_size %u, args_offset %u", args_host_,
src_args_size, args_offset);
errno_t sec_ret =
memcpy_s(static_cast<char *>(args_host_) + args_offset, src_args_size, io_addrs.data(), src_args_size);
if (sec_ret != EOK) {
GELOGE(FAILED, "Call memcpy_s failed, ret: %d", sec_ret);
return FAILED;
}
GELOGI("DavinciModel::UpdateKnownZeroCopyAddr success.");
return SUCCESS;
}
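// Worked example (illustrative): with io_addrs.size() == 3, src_args_size is
// 3 * sizeof(uint64_t) = 24 bytes written into args_host_ at args_offset;
// UpdateKnownNodeArgs below then copies the whole host table to device once.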

Status DavinciModel::UpdateKnownNodeArgs(const vector<void *> &inputs, const vector<void *> &outputs) {
GELOGI("DavinciModel::UpdateKnownNodeArgs in");
GE_CHK_STATUS_RET(CreateKnownZeroCopyMap(inputs, outputs),
"DavinciModel::UpdateKnownNodeArgs create map for input/output zero copy.");
for (size_t task_index = 0; task_index < task_list_.size(); ++task_index) {
auto &task = task_list_[task_index];
if (task != nullptr) {
Status ret = task->UpdateArgs();
if (ret != SUCCESS) {
GELOGE(FAILED, "task %d created by davinci model is nullptr.", task_index);
return FAILED;
}
}
}
GELOGI("DavinciModel::UpdateKnownNodeArgs device args %p, size %u, host args %p, size %u", args_, total_args_size_,
args_host_, total_args_size_);
// copy continuous args from host to device
Status rt_ret = rtMemcpy(args_, total_args_size_, args_host_, total_args_size_, RT_MEMCPY_HOST_TO_DEVICE);
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy error, ret: 0x%X", rt_ret); return FAILED;)

GELOGI("DavinciModel::UpdateKnownNodeArgs success");
return SUCCESS;
}

Status DavinciModel::InitTaskInfo(domi::ModelTaskDef &model_task_def) {
GELOGI("InitTaskInfo in,task size %zu", model_task_def.task().size());
task_list_.resize(model_task_def.task_size());
@@ -2513,13 +2367,13 @@ Status DavinciModel::InitTaskInfo(domi::ModelTaskDef &model_task_def) {
GELOGE(RT_FAILED, "Failed to set context from rt, error-code 0x%X.", rt_ret);
return RT_FAILED;
}

model->task_list_[idx] = TaskInfoFactory::Instance().Create(static_cast<rtModelTaskType_t>(task.type()));

Status ret = FAILED;
if (model->task_list_[idx] != nullptr) {
ret = model->task_list_[idx]->Init(task, model);
// dynamic shape may have created task_list_ entries beforehand
if (model->task_list_[idx] == nullptr) {
model->task_list_[idx] = TaskInfoFactory::Instance().Create(static_cast<rtModelTaskType_t>(task.type()));
GE_CHECK_NOTNULL(model->task_list_[idx]);
}
ret = model->task_list_[idx]->Init(task, model);
return ret;
},
model_task_def.task(i), this, ctx, i);
@@ -2543,6 +2397,39 @@ Status DavinciModel::InitTaskInfo(domi::ModelTaskDef &model_task_def) {
return SUCCESS;
}

Status DavinciModel::MallocKnownArgs() {
GELOGI("DavinciModel::MallocKnownArgs in");
if (model_task_def_->task_size() == 0) {
GELOGW("DavinciModel::MallocKnownArgs davincimodel has no task info.");
return SUCCESS;
}
task_list_.resize(model_task_def_->task_size());
for (int32_t i = 0; i < model_task_def_->task_size(); ++i) {
const domi::TaskDef &taskdef = model_task_def_->task(i);
task_list_[i] = TaskInfoFactory::Instance().Create(static_cast<rtModelTaskType_t>(taskdef.type()));
GE_CHECK_NOTNULL(task_list_[i]);
Status ret = task_list_[i]->CalculateArgs(taskdef, this);
if (ret != SUCCESS) {
GELOGE(ret, "TaskInfo CalculateArgs failed.");
return ret;
}
}
// malloc args memory
rtError_t rt_ret = rtMalloc(&args_, total_args_size_, RT_MEMORY_HBM);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rtMalloc failed, ret: 0x%X", rt_ret);
return RT_FAILED;
}
// malloc args host memory
rt_ret = rtMallocHost(&args_host_, total_args_size_);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rtMallocHost failed, ret: 0x%X", rt_ret);
return RT_FAILED;
}
GELOGI("DavinciModel::MallocKnownArgs success, total args size %u.", total_args_size_);
return SUCCESS;
}
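// Call-order sketch (as wired in DoTaskSink): MallocKnownArgs sizes the args
// buffers via each task's CalculateArgs before InitTaskInfo runs, so a task
// can later take its slice through GetCurrentArgsAddr(offset).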

Status DavinciModel::DistributeTask() {
GELOGI("do Distribute.");
for (auto &task : cpu_task_list_) {
@@ -3117,7 +3004,7 @@ bool DavinciModel::IsBroadCastOpData(const ge::NodePtr &var_node) {
GE_RT_FALSE_CHECK_NOTNULL(in_anchor);
ge::NodePtr dst_node = in_anchor->GetOwnerNode();
GE_RT_FALSE_CHECK_NOTNULL(dst_node);
if (dst_node->GetType() == HCOMBROADCAST) {
if (dst_node->GetType() == HCOMBROADCAST || dst_node->GetType() == HVDCALLBACKBROADCAST) {
return true;
}
}
@@ -3126,32 +3013,15 @@ bool DavinciModel::IsBroadCastOpData(const ge::NodePtr &var_node) {
}

void DavinciModel::InitZeroCopyUtil(bool is_dynamic_batch, bool &input_zero_copy, bool &output_zero_copy) {
auto dump_path = PropertiesManager::Instance().GetDumpOutputPath();
auto enable_dump = !dump_path.empty();

auto dump_op_env = std::getenv("DUMP_OP");
if (dump_op_env != nullptr) {
string dump_op_flag(dump_op_env);
if (dump_op_flag == "1") {
enable_dump = true;
}
}

GELOGI("dump path: %s, dump_op_env: %s", dump_path.c_str(), dump_op_env);
if (!is_dynamic_batch) {
zero_copy_batch_label_addrs_.clear();
}

if (enable_dump) {
input_zero_copy = false;
output_zero_copy = false;
} else {
for (const auto &addrs : output_outside_addrs_) {
const auto &used_list = addrs.second;
if (used_list.empty()) {
output_zero_copy = false;
break;
}
for (const auto &addrs : output_outside_addrs_) {
const auto &used_list = addrs.second;
if (used_list.empty()) {
output_zero_copy = false;
break;
}
}
}
@@ -3244,11 +3114,11 @@ Status DavinciModel::NnExecute(rtStream_t stream, bool async_mode, const InputDa
return SUCCESS;
}

uint8_t *DavinciModel::MallocFeatureMapMem(uint64_t data_size) {
uint8_t *DavinciModel::MallocFeatureMapMem(size_t data_size) {
uint8_t *mem_base = nullptr;
const string purpose("feature map,used for op input and output.");
if (std::getenv(kEnvGeuseStaticMemory) != nullptr) {
data_size = static_cast<uint64_t>(VarManager::Instance(0)->GetGraphMemoryMaxSize());
data_size = static_cast<size_t>(VarManager::Instance(0)->GetGraphMemoryMaxSize());
string memory_key = std::to_string(0) + "_f";
mem_base = MemManager::Instance(RT_MEMORY_HBM)->MallocMemory(purpose, memory_key, data_size, GetDeviceId());
} else {
@@ -3261,7 +3131,7 @@ uint8_t *DavinciModel::MallocFeatureMapMem(uint64_t data_size) {
return mem_base;
}

uint8_t *DavinciModel::MallocWeightsMem(uint32_t weights_size) {
uint8_t *DavinciModel::MallocWeightsMem(size_t weights_size) {
uint8_t *weights_mem_base = nullptr;
const string purpose("weights memory in inference network.");
if (std::getenv(kEnvGeuseStaticMemory) != nullptr) {
@@ -3319,10 +3189,6 @@ uint32_t DavinciModel::GetGraphID(const std::string &session_graph_id) {

Status DavinciModel::TransAllVarData(ComputeGraphPtr &graph, uint32_t graph_id) {
GELOGI("TransAllVarData start: session_id:%lu, graph_id: %u.", session_id_, graph_id);

ThreadPool executor(kThreadNum);
std::vector<std::future<Status>> vector_future;

rtContext_t ctx = nullptr;
rtError_t rt_ret = rtCtxGetCurrent(&ctx);
if (rt_ret != RT_ERROR_NONE) {
@@ -3330,6 +3196,7 @@ Status DavinciModel::TransAllVarData(ComputeGraphPtr &graph, uint32_t graph_id)
return RT_FAILED;
}

std::vector<NodePtr> variable_node_list;
for (ge::NodePtr &node : graph->GetDirectNode()) {
if (node == nullptr) {
continue;
@@ -3337,63 +3204,13 @@ Status DavinciModel::TransAllVarData(ComputeGraphPtr &graph, uint32_t graph_id)
if (node->GetType() != VARIABLE) {
continue;
}
std::future<Status> f = executor.commit(
[](ge::NodePtr &node, DavinciModel *model, rtContext_t ctx, uint32_t graph_id) -> Status {
if (model == nullptr) {
GELOGE(FAILED, "DavinciModel is NULL!");
return FAILED;
}
rtError_t rt_ret = rtCtxSetCurrent(ctx);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Failed to set context, error_code is: 0x%X.", rt_ret);
return RT_FAILED;
}
uint32_t allocated_graph_id = 0;
Status ret = VarManager::Instance(model->session_id_)->GetAllocatedGraphId(node->GetName(), allocated_graph_id);
if (ret != SUCCESS) {
GELOGE(INTERNAL_ERROR, "var has not been allocated, node:%s, graph_id:%u.", node->GetName().c_str(),
graph_id);
return INTERNAL_ERROR;
}
uint32_t changed_graph_id = 0;
ret = VarManager::Instance(model->session_id_)->GetChangedGraphId(node->GetName(), changed_graph_id);
bool call_trans_var =
(ret == SUCCESS && changed_graph_id == graph_id && changed_graph_id != allocated_graph_id);
if (call_trans_var) {
GELOGI("VarManager::GetChangedGraphId() success, node:%s, graph_id:%u.", node->GetName().c_str(), graph_id);
VarTransRoad *trans_road = VarManager::Instance(model->session_id_)->GetTransRoad(node->GetName());
if (trans_road == nullptr) {
GELOGI("The variable %s does not have any trans road", node->GetName().c_str());
return SUCCESS;
}
ret = TransVarData(node, *trans_road, model->session_id_);
if (ret != SUCCESS) {
GELOGE(INTERNAL_ERROR, "TransVarData failed, node:%s, graph_id:%u.", node->GetName().c_str(), graph_id);
return INTERNAL_ERROR;
}
VarManager::Instance(model->session_id_)->RemoveChangedGraphId(node->GetName());
}
return SUCCESS;
},
node, this, ctx, graph_id);
if (!f.valid()) {
GELOGE(FAILED, "Future is invalid");
return FAILED;
}
vector_future.push_back(std::move(f));
variable_node_list.emplace_back(node);
}

Status ret_status;
for (size_t i = 0; i < vector_future.size(); ++i) {
ret_status = vector_future[i].get();
if (ret_status != SUCCESS) {
GELOGE(ret_status, "TransAllVarData:: trans %zu vardata failed", i);
return ret_status;
}
}
GE_CHK_STATUS_RET_NOLOG(
TransVarDataUtils::TransAllVarData(variable_node_list, session_id_, ctx, graph_id, kThreadNum));

GELOGI("TransAllVarData success.");

return SUCCESS;
}

@@ -3457,96 +3274,8 @@ void DavinciModel::ReuseHcclFollowStream(int64_t remain_cap, int64_t &index) {
}
}

Status TransTensor(uint8_t *var_data, const NodePtr &var_src, const NodePtr &var_dst, formats::TransResult &result) {
GE_CHECK_NOTNULL(var_src);
GE_CHECK_NOTNULL(var_src->GetOpDesc());
GE_CHECK_NOTNULL(var_dst);
GE_CHECK_NOTNULL(var_dst->GetOpDesc());
auto src_data_shape_size = var_src->GetOpDesc()->GetOutputDesc(0).GetShape().GetShapeSize();
auto src_data_datatype = var_src->GetOpDesc()->GetOutputDesc(0).GetDataType();
auto dst_data_datatype = var_dst->GetOpDesc()->GetOutputDesc(0).GetDataType();
GE_IF_BOOL_EXEC(
src_data_datatype != dst_data_datatype,
auto ret = formats::TransDataType(
{var_data, static_cast<size_t>(src_data_shape_size), src_data_datatype, dst_data_datatype}, result);
if (ret != SUCCESS) {
GELOGE(INTERNAL_ERROR, "trans var data on host failed");
return ret;
});
return SUCCESS;
}

Status DavinciModel::CopyTensorFromSrcVarNode(const NodePtr &var_src, const NodePtr &var_dst) {
/// after FE fusion pass, input num of applymomentum op was changed, 0th input is var_fp32, 6th input is
/// var_fp16(new).
/// unlink edges between var_fp32 and "dst_node" (need fp16) of var_fp32, add edge between var_fp16 and dst_node.
/// need copy value from var_fp32 to var_fp16.
/// [opdesc of var_src and var_dst are checked before passed in, no need to check if they are nullptr]
GE_IF_BOOL_EXEC(var_src == nullptr || var_dst == nullptr, GELOGE(FAILED, "node var is nullptr"); return FAILED);
// src_node output_desc (fp32)
GeTensorDesc output_desc = var_src->GetOpDesc()->GetOutputDesc(0);
auto src_data_type = output_desc.GetDataType();
auto src_shape = output_desc.GetShape();
auto src_format = output_desc.GetFormat();
GELOGI("src_node %s, src_format %s, src_shape %s, src_type %s", var_src->GetName().c_str(),
TypeUtils::FormatToSerialString(src_format).c_str(), formats::ShapeToString(src_shape).c_str(),
TypeUtils::DataTypeToSerialString(src_data_type).c_str());
// dst_node output_desc (fp16)
GeTensorDesc dst_tensor_desc = var_dst->GetOpDesc()->GetOutputDesc(0);
auto data_type = dst_tensor_desc.GetDataType();
auto data_shape = dst_tensor_desc.GetShape();
auto data_format = dst_tensor_desc.GetFormat();
GELOGI("dst_node %s, src_format %s, src_shape %s, src_type %s", var_dst->GetName().c_str(),
TypeUtils::FormatToSerialString(data_format).c_str(), formats::ShapeToString(data_shape).c_str(),
TypeUtils::DataTypeToSerialString(data_type).c_str());
// Sync var data from device
std::unique_ptr<uint8_t[]> var_src_data;
RtContextSwitchGuard switch_context(RT_CTX_NORMAL_MODE, device_id_);
// copy from src_node
auto ret = CopyVarFromDevice(session_id_, var_src, var_src_data, output_desc);
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(FAILED, "Copy Var From Device failed"); return ret);
// trans dtype
formats::TransResult trans_result;
ret = TransTensor(var_src_data.get(), var_src, var_dst, trans_result);
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(INTERNAL_ERROR, "trans var data on host failed"); return ret);
// reset src value.
void *var_device = nullptr;
ret = ReAssignVarAddr(session_id_, var_dst->GetName(), dst_tensor_desc, &var_device);
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(INTERNAL_ERROR, "assign mem failed"); return ret);
// copy to device
ret = CopyVarToDevice(var_dst, trans_result, var_device);
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(ret, "Failed to send var data to device"); return ret);
return SUCCESS;
}

Status DavinciModel::CopyVarData(ComputeGraphPtr &compute_graph) {
GELOGI("CopyVarData start: session_id:%lu.", session_id_);
if (compute_graph == nullptr) {
GELOGE(FAILED, "compute_graph is nullptr");
return FAILED;
}

string cp_from_node;
bool copy_value = false;
for (auto &node : compute_graph->GetAllNodes()) {
GE_IF_BOOL_EXEC(node->GetOpDesc() == nullptr || node->GetOpDesc()->GetType() != VARIABLE, continue);
GE_IF_BOOL_EXEC(ge::AttrUtils::GetStr(node->GetOpDesc(), "_copy_from_var_node", cp_from_node),
GELOGI("Get original type of cp_from_node"));
if (cp_from_node.length() != 0) {
(void)ge::AttrUtils::GetBool(node->GetOpDesc(), "_copy_value", copy_value); // no need to check value
if (!copy_value) {
auto src_node = compute_graph->FindNode(cp_from_node);
GE_CHECK_NOTNULL(src_node);
GELOGI("current_var_node__: [%s] copy_from_var_node__: [%s].", node->GetName().c_str(),
src_node->GetName().c_str());
auto ret = CopyTensorFromSrcVarNode(src_node, node);
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(FAILED, "copy tensor failed!"); return FAILED);
// only copy once
(void)ge::AttrUtils::SetBool(node->GetOpDesc(), "_copy_value", true); // no need to check value
}
}
}
return SUCCESS;
return TransVarDataUtils::CopyVarData(compute_graph, session_id_, device_id_);
}

Status DavinciModel::GetComputeGraphInfo(std::vector<ComputeGraphDescInfo> &compute_graph_desc_info) {


+ 34
- 5
src/ge/graph/load/new_model_manager/davinci_model.h View File

@@ -250,6 +250,11 @@ class DavinciModel {
return res;
}

rtStream_t GetRtModelStream() {
rtStream_t res = rt_model_stream_;
return res;
}

uint64_t GetRtBaseAddr() const { return runtime_param_.logic_mem_base; }

uint64_t GetRtWeightAddr() const { return runtime_param_.logic_weight_base; }
@@ -427,6 +432,26 @@ class DavinciModel {
void CreateHcclFollowStream(rtStream_t stream, int64_t remain_cap);
void ReuseHcclFollowStream(int64_t remain_cap, int64_t &index);

void InitRuntimeParams();
Status InitVariableMem();

void UpdateMemBase(uint8_t *mem_base) {
runtime_param_.mem_base = mem_base;
mem_base_ = mem_base;
}
void SetTotalArgsSize(uint32_t args_size) { total_args_size_ += args_size; }
uint32_t GetTotalArgsSize() { return total_args_size_; }
void *GetCurrentArgsAddr(uint32_t offset) {
void *cur_args = static_cast<char *>(args_) + offset;
return cur_args;
}
void SetKnownNode(bool known_node) { known_node_ = known_node; }
bool IsKnownNode() { return known_node_; }
Status MallocKnownArgs();
Status UpdateKnownNodeArgs(const vector<void *> &inputs, const vector<void *> &outputs);
Status CreateKnownZeroCopyMap(const vector<void *> &inputs, const vector<void *> &outputs);
Status UpdateKnownZeroCopyAddr(vector<void *> &io_addrs, uint32_t args_offset);
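// Known-node execution sketch (assumption, mirroring davinci_model.cc):
// MallocKnownArgs -> per-task CalculateArgs -> UpdateKnownNodeArgs(inputs, outputs)
// refreshes args_host_ and copies it to args_ before each run.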

private:
// memory address of weights
uint8_t *weights_mem_base_;
@@ -523,9 +548,9 @@ class DavinciModel {

Status DistributeTask();

uint8_t *MallocFeatureMapMem(uint64_t data_size);
uint8_t *MallocFeatureMapMem(size_t data_size);

uint8_t *MallocWeightsMem(uint32_t weights_size);
uint8_t *MallocWeightsMem(size_t weights_size);

void FreeFeatureMapMem();

@@ -690,8 +715,6 @@ class DavinciModel {
///
Status CpuModelRepeat();

void InitRuntimeParams();

///
/// @ingroup ge
/// @brief set ts device.
@@ -709,7 +732,6 @@ class DavinciModel {

Status TransAllVarData(ComputeGraphPtr &graph, uint32_t graph_id);
Status CopyVarData(ComputeGraphPtr &graph);
Status CopyTensorFromSrcVarNode(const NodePtr &var_src, const NodePtr &var_dst);

// get desc info of graph for profiling
Status GetComputeGraphInfo(vector<ComputeGraphDescInfo> &compute_graph_desc_info);
@@ -827,6 +849,13 @@ class DavinciModel {
DataDumper data_dumper_;
uint64_t iterator_count_;
bool is_l1_fusion_enable_;

bool known_node_ = false;
uint32_t total_args_size_ = 0;
void *args_ = nullptr;
void *args_host_ = nullptr;
std::map<const void *, void *> knonw_input_data_info_;
std::map<const void *, void *> knonw_output_data_info_;
};
} // namespace ge
#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_DAVINCI_MODEL_H_

+ 77
- 6
src/ge/graph/load/new_model_manager/model_manager.cc View File

@@ -24,6 +24,7 @@
#include "framework/common/debug/ge_log.h"
#include "graph/load/new_model_manager/davinci_model.h"
#include "graph/load/new_model_manager/davinci_model_parser.h"
#include "model/ge_root_model.h"

namespace ge {
thread_local uint32_t device_count = 0;
@@ -68,8 +69,6 @@ Status ModelManager::KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, u
GE_CHK_RT(rtFree(aicpu_kernel_addr)); return FAILED;)
uint64_t kernel_id_addr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(aicpu_kernel_addr));
param_base.fwkKernelBase.fwk_kernel.kernelID = kernel_id_addr;
// Remove model key from map
model_aicpu_kernel_.erase(iter);
}
}

@@ -214,18 +213,38 @@ Status ModelManager::SetDevice(int32_t deviceId) const {
return SUCCESS;
}

ge::Status ModelManager::DoLoadHybridModelOnline(uint32_t model_id, const shared_ptr<ge::GeRootModel> &ge_root_model,
const shared_ptr<ModelListener> &listener) {
auto hybrid_model = hybrid::HybridDavinciModel::Create(ge_root_model);
GE_CHECK_NOTNULL(hybrid_model);
hybrid_model->SetListener(listener);
hybrid_model->SetModelId(model_id);
hybrid_model->SetDeviceId(GetContext().DeviceId());
GE_CHK_STATUS_RET(hybrid_model->Init(), "Failed to init hybrid model. model_id = %u", model_id);
auto shared_model = std::shared_ptr<hybrid::HybridDavinciModel>(hybrid_model.release());
InsertModel(model_id, shared_model);
return SUCCESS;
}
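// Dispatch sketch: LoadModelOnline below routes unknown-shape root models here,
// while static-shape models continue down the DavinciModel path.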

///
/// @ingroup domi_ome
/// @brief load model online
/// @return Status run result
///
Status ModelManager::LoadModelOnline(uint32_t &model_id, const shared_ptr<ge::GeModel> &ge_model,
Status ModelManager::LoadModelOnline(uint32_t &model_id, const shared_ptr<ge::GeRootModel> &ge_root_model,
std::shared_ptr<ModelListener> listener) {
GE_CHK_BOOL_RET_STATUS(listener.get() != nullptr, PARAM_INVALID, "Param incorrect, listener is null");
if (model_id == INVALID_MODEL_ID) {
GenModelId(&model_id);
}

bool is_shape_unknown = false;
GE_CHK_STATUS_RET(ge_root_model->CheckIsUnknownShape(is_shape_unknown), "CheckIsUnknownShape failed, model id:%u",
model_id);
if (is_shape_unknown) {
return DoLoadHybridModelOnline(model_id, ge_root_model, listener);
}

GE_CHK_STATUS_RET(SetDevice(static_cast<int32_t>(GetContext().DeviceId())), "Set device failed, model id:%u.",
model_id);
mmTimespec timespec = mmGetTickCount();
@@ -238,6 +257,11 @@ Status ModelManager::LoadModelOnline(uint32_t &model_id, const shared_ptr<ge::Ge
davinci_model->SetId(model_id);
davinci_model->SetDeviceId(GetContext().DeviceId());

auto root_graph = ge_root_model->GetRootGraph();
GE_CHECK_NOTNULL(root_graph);
string root_model_name = root_graph->GetName();
auto name_to_model = ge_root_model->GetSubgraphInstanceNameToModel();
GeModelPtr ge_model = name_to_model[root_model_name];
Status ret = SUCCESS;
do {
GE_TIMESTAMP_START(Assign);
@@ -274,16 +298,26 @@ void ModelManager::InsertModel(uint32_t id, std::shared_ptr<DavinciModel> &davin
model_map_[id] = davinci_model;
}

void ModelManager::InsertModel(uint32_t id, shared_ptr<hybrid::HybridDavinciModel> &hybrid_model) {
GE_CHK_BOOL_EXEC(hybrid_model != nullptr, return, "hybrid_model ptr is null, id: %u", id);
std::lock_guard<std::mutex> lock(map_mutex_);
hybrid_model_map_[id] = hybrid_model;
}

Status ModelManager::DeleteModel(uint32_t id) {
std::lock_guard<std::mutex> lock(map_mutex_);

auto it = model_map_.find(id);
if (it == model_map_.end()) {
auto hybrid_model_it = hybrid_model_map_.find(id);
if (it != model_map_.end()) {
(void)model_map_.erase(it);
} else if (hybrid_model_it != hybrid_model_map_.end()) {
(void)hybrid_model_map_.erase(hybrid_model_it);
} else {
GELOGE(PARAM_INVALID, "model id %u does not exists.", id);
return PARAM_INVALID;
}

(void)model_map_.erase(it);
return SUCCESS;
}

@@ -294,6 +328,13 @@ std::shared_ptr<DavinciModel> ModelManager::GetModel(uint32_t id) {
return (it == model_map_.end()) ? nullptr : it->second;
}

std::shared_ptr<hybrid::HybridDavinciModel> ModelManager::GetHybridModel(uint32_t id) {
std::lock_guard<std::mutex> lock(map_mutex_);

auto it = hybrid_model_map_.find(id);
return (it == hybrid_model_map_.end()) ? nullptr : it->second;
}

Status ModelManager::Unload(uint32_t model_id) {
GE_CHK_STATUS_RET(DeleteModel(model_id), "failed to unload model id: %u", model_id);
if (device_count > 0) {
@@ -349,7 +390,10 @@ Status ModelManager::DataInput(const InputData &input_data, OutputData &output_d
///
Status ModelManager::DataInputTensor(uint32_t model_id, const std::vector<InputTensorInfo> &inputs) {
std::shared_ptr<DavinciModel> model = GetModel(model_id);
GE_CHECK_NOTNULL(model);
auto hybrid_model = GetHybridModel(model_id);
if (hybrid_model == nullptr) {
GE_CHECK_NOTNULL(model);
}

InputData input_data;
input_data.model_id = model_id;
@@ -374,6 +418,12 @@ Status ModelManager::DataInputTensor(uint32_t model_id, const std::vector<InputT
GE_CHK_STATUS_EXEC(data_wrap->Init(input_data, output_data), return domi::PUSH_DATA_FAILED,
"Init InputDataWrapper failed,input data model_id is : %u.", model_id);

if (hybrid_model != nullptr) {
GE_CHK_STATUS_RET(hybrid_model->EnqueueData(data_wrap), "Data queue is full, please call again later, model_id %u ",
model_id);
return SUCCESS;
}

GE_CHK_BOOL_RET_STATUS(model != nullptr, PARAM_INVALID, "Invalid Model ID %u in InputData! ", model_id);

DataInputer *inputer = model->GetDataInputer();
@@ -395,6 +445,13 @@ Status ModelManager::DataInputTensor(uint32_t model_id, const std::vector<InputT
/// @author
///
Status ModelManager::Start(uint32_t model_id) {
auto hybrid_model = GetHybridModel(model_id);
if (hybrid_model != nullptr) {
GE_CHK_STATUS_RET_NOLOG(hybrid_model->ModelRunStart());
GELOGI("Start hybrid model %u success.", model_id);
return SUCCESS;
}

std::shared_ptr<DavinciModel> davinci_model = GetModel(model_id);

GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, PARAM_INVALID, "Invalid Model ID %u to start! ", model_id);
@@ -416,6 +473,13 @@ Status ModelManager::Start(uint32_t model_id) {
/// @author
///
Status ModelManager::Stop(uint32_t model_id) {
auto hybrid_model = GetHybridModel(model_id);
if (hybrid_model != nullptr) {
GE_CHK_STATUS_RET_NOLOG(hybrid_model->ModelRunStop());
GELOGI("Stop hybrid model %u success.", model_id);
return SUCCESS;
}

std::shared_ptr<DavinciModel> davinci_model = GetModel(model_id);
GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, PARAM_INVALID, "Invalid Model ID %u to stop!", model_id);

@@ -581,6 +645,13 @@ Status ModelManager::HandleDumpCommand(const Command &command) {
}

Status ModelManager::GetMaxUsedMemory(const uint32_t model_id, uint64_t &max_size) {
auto hybrid_model = GetHybridModel(model_id);
if (hybrid_model != nullptr) {
// TODO: hybrid model uses dynamic memory allocation, so report 0 for now
max_size = 0;
return SUCCESS;
}

std::shared_ptr<DavinciModel> davinci_model = GetModel(model_id);
GE_CHK_BOOL_RET_STATUS(davinci_model != nullptr, PARAM_INVALID, "GetMaxUsedMemory Failed, Invalid Model ID %u !",
model_id);


+ 10
- 2
src/ge/graph/load/new_model_manager/model_manager.h View File

@@ -25,6 +25,7 @@
#include <set>
#include <string>
#include <vector>
#include <model/ge_root_model.h>
#include "cce/aicpu_engine_struct.h"
#include "common/ge_inner_error_codes.h"
#include "common/ge_types.h"
@@ -34,10 +35,10 @@
#include "ge/ge_api_types.h"
#include "graph/ge_context.h"
#include "graph/model.h"
#include "hybrid/hybrid_davinci_model.h"
#include "runtime/base.h"

namespace ge {

class DavinciModel;

class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager {
@@ -69,9 +70,12 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager {
/// @return Status run result
/// @author @
///
ge::Status LoadModelOnline(uint32_t &model_id, const std::shared_ptr<ge::GeModel> &model,
ge::Status LoadModelOnline(uint32_t &model_id, const std::shared_ptr<ge::GeRootModel> &ge_root_model,
std::shared_ptr<ModelListener> listener);

ge::Status DoLoadHybridModelOnline(uint32_t model_id, const shared_ptr<ge::GeRootModel> &ge_root_model,
const std::shared_ptr<ModelListener> &listener);

///
/// @ingroup ge
/// @brief ACL case, Load task list with queue.
@@ -206,6 +210,8 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager {
///
std::shared_ptr<DavinciModel> GetModel(uint32_t id);

std::shared_ptr<hybrid::HybridDavinciModel> GetHybridModel(uint32_t id);

ge::Status KernelLaunchEx(aicpu::FWKAdapter::FWKOperateType op_type, uint64_t session_id, uint32_t model_id);

ge::Status CreateAicpuSession(uint64_t session_id);
@@ -238,6 +244,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager {
/// @brief insert new model into model manager set
///
void InsertModel(uint32_t id, std::shared_ptr<DavinciModel> &davinci_model);
void InsertModel(uint32_t id, std::shared_ptr<hybrid::HybridDavinciModel> &hybrid_model);

///
/// @ingroup domi_ome
@@ -248,6 +255,7 @@ class FMK_FUNC_HOST_VISIBILITY FMK_FUNC_DEV_VISIBILITY ModelManager {
void GenModelId(uint32_t *id);

std::map<uint32_t, std::shared_ptr<DavinciModel>> model_map_;
std::map<uint32_t, std::shared_ptr<hybrid::HybridDavinciModel>> hybrid_model_map_;
std::map<std::string, std::vector<uint64_t>> model_aicpu_kernel_;
uint32_t max_model_id_;
std::mutex map_mutex_;


+ 1
- 1
src/ge/graph/load/new_model_manager/model_utils.cc

@@ -474,7 +474,7 @@ vector<void *> ModelUtils::GetWorkspaceDataAddrs(const RuntimeParam &model_param
int64_t workspace_bytes = v_workspace_bytes[i];
uint8_t *mem_addr = workspace_bytes == 0 ? nullptr : mem_base + workspace_offset;
v_workspace_data_addr.push_back(mem_addr);
GELOGI("[IMAS]GetWorkspaceDataAddrs graph_%u type[F] name[%s] output[%zu] offset[%ld] bytes[%ld] memaddr[%p]",
GELOGI("[IMAS]GetWorkspaceDataAddrs graph_%u type[F] name[%s] workspace[%zu] offset[%ld] bytes[%ld] memaddr[%p]",
model_param.graph_id, op_desc->GetName().c_str(), i, workspace_offset, workspace_bytes, mem_addr);
}
}


+ 123
- 96
src/ge/graph/load/new_model_manager/task_info/hccl_task_info.cc

@@ -37,17 +37,12 @@ HcclTaskInfo::~HcclTaskInfo() {
if (ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rtFree Fail, ret = 0x%X.", ret);
}

private_def_ = nullptr;
}
input_data_addr_ = nullptr;
davinci_model_ = nullptr;
ops_kernel_store_ = nullptr;
output_data_addr_ = nullptr;
workspace_addr_ = nullptr;
max_node_of_hccl_stream_ = 0;
}

Status HcclTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_model) {
GELOGI("HcclTaskInfo Init Start.");
if (davinci_model == nullptr) {
@@ -55,63 +50,75 @@ Status HcclTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_m
return PARAM_INVALID;
}
davinci_model_ = davinci_model;

Status ret = SetStream(task_def.stream_id(), davinci_model->GetStreamList());
if (ret != SUCCESS) {
return ret;
}

GetPrivateDefByTaskDef(task_def);

auto hccl_def = task_def.kernel_hccl();
hcclDataType_t data_type;
int32_t count;
uint32_t op_index = hccl_def.op_index();
GELOGI("HcclTaskInfo Init, op_index is: %u", op_index);
std::string hccl_type = hccl_def.hccl_type();

// Get HCCL op
OpDescPtr op_desc = davinci_model->GetOpByIndex(op_index);
GE_CHECK_NOTNULL(op_desc);

Status dmrt = HcomOmeUtil::GetHcomDataType(op_desc, data_type);
// Create the kernel hccl infos
CreateKernelHcclInfo(op_desc);

// Initialize the hccl_type of all kernel hccl info
HcomOmeUtil::GetHcclType(task_def, kernel_hccl_infos_);

// Only in Horovod scenario should get the inputName and GeShape
ret = HcomOmeUtil::GetHorovodInputs(op_desc, kernel_hccl_infos_);
if (ret != SUCCESS) {
GELOGE(FAILED, "davinci_model: GetHorovodInputs fail! domi error: %u", ret);
return FAILED;
}
Status dmrt = HcomOmeUtil::GetHcclDataType(op_desc, kernel_hccl_infos_);
if (dmrt != SUCCESS) {
GELOGE(FAILED, "davinci_model: GetHcomDataType fail! domi error: %u", dmrt);
return FAILED;
}

dmrt = HcomOmeUtil::GetHcomCount(op_desc, data_type, (hccl_type == HCOMALLGATHER), count);
dmrt = HcomOmeUtil::GetHcclCount(op_desc, kernel_hccl_infos_);
if (dmrt != SUCCESS) {
GELOGE(FAILED, "davinci_model: GetHcomCount fail! domi error: %u", dmrt);
return FAILED;
}

ret = SetAddrs(hccl_type, op_desc);
// Only HCOMBROADCAST and HVDCALLBACKBROADCAST need to get the rootId
dmrt = HcomOmeUtil::GetAllRootId(op_desc, kernel_hccl_infos_);
if (dmrt != SUCCESS) {
GELOGE(FAILED, "davinci_model: Get rootId fail! domi error: %u", dmrt);
return FAILED;
}
ret = SetAddrs(op_desc, kernel_hccl_infos_);
if (ret != SUCCESS) {
GELOGE(ret, "Setaddrs Fail.");
return ret;
}

count_ = count;
hccl_type_ = hccl_type;
data_type_ = data_type;

// GE's new process: hccl declares the need for Workspace size, and GE allocates Workspace
auto workspace_bytes = op_desc->GetWorkspaceBytes();
if (!workspace_bytes.empty()) {
uint64_t workspace_mem_size_tmp = workspace_bytes[0];
GELOGI("hccl need workSpaceMemSize=%lu", workspace_mem_size_tmp);
if (workspace_mem_size_tmp != 0) {
workspace_mem_size_ = workspace_mem_size_tmp;
vector<void *> workspace_data_addrs =
ModelUtils::GetWorkspaceDataAddrs(davinci_model->GetRuntimeParam(), op_desc);
if (!workspace_data_addrs.empty()) {
GELOGI("Get workSpaceAddr");
workspace_addr_ = workspace_data_addrs[0];
}
}
ret = SetWorkspace(op_desc, kernel_hccl_infos_);
if (ret != SUCCESS) {
GELOGE(ret, "SetWorkspace Fail.");
return ret;
}
// GE's new process: hccl declares the number of streams required, creates a stream by GE, and sends it to hccl
ret = SetFollowStream(op_desc, davinci_model);
if (ret != SUCCESS) {
GELOGE(ret, "SetStream Fail.");
return ret;
}

GELOGI("HcclTaskInfo Init Success");
return SUCCESS;
}

Status HcclTaskInfo::SetFollowStream(const ge::ConstOpDescPtr &op_desc, DavinciModel *davinci_model) {
if (!HcomOmeUtil::IsHCOMOp(op_desc->GetType())) {
GELOGI("Node %s Optye %s no need to create slave streams.", op_desc->GetName().c_str(), op_desc->GetType().c_str());
return SUCCESS;
}
Status ret;
int64_t hccl_stream_num = 0;
if (!ge::AttrUtils::GetInt(op_desc, "used_stream_num", hccl_stream_num)) {
GELOGI("op_desc has no attr used_stream_num!");
@@ -142,8 +149,7 @@ Status HcclTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davinci_m
return FAILED;
}
}

GELOGI("HcclTaskInfo Init Success, hcclStreamNum =%ld", hccl_stream_num);
GELOGI("Initialize hccl slave stream success, hcclStreamNum =%ld", hccl_stream_num);
return SUCCESS;
}

@@ -167,14 +173,12 @@ Status HcclTaskInfo::CreateStream(int64_t stream_num, DavinciModel *davinci_mode
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
return RT_FAILED;
}

// Create slave stream, inactive by default, activated by hccl
rt_ret = rtModelBindStream(davinci_model->GetRtModelHandle(), stream, RT_MODEL_WAIT_ACTIVE_STREAM);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Call rt api failed, ret: 0x%X", rt_ret);
return RT_FAILED;
}

GELOGD("hccl_stream addr is=%p", stream);
int64_t remain_cap = max_node_of_hccl_stream_ - 1;
davinci_model->CreateHcclFollowStream(stream, remain_cap);
@@ -192,7 +196,6 @@ Status HcclTaskInfo::Distribute() {
GELOGE(INTERNAL_ERROR, "ops kernel store is null.");
return INTERNAL_ERROR;
}

OpsKernelInfoStore *ops_kernel_info_store = reinterpret_cast<OpsKernelInfoStore *>(ops_kernel_store_);
GE_CHECK_NOTNULL(ops_kernel_info_store);
GETaskInfo ge_task;
@@ -202,81 +205,62 @@ Status HcclTaskInfo::Distribute() {
GELOGE(INTERNAL_ERROR, "davinci_model : load task fail, return ret: %u", result);
return INTERNAL_ERROR;
}

GELOGI("HcclTaskInfo Distribute Success.");
return SUCCESS;
}

Status HcclTaskInfo::SetAddrs(const std::string &hccl_type, const std::shared_ptr<OpDesc> &op_desc) {
Status HcclTaskInfo::SetAddrs(const std::shared_ptr<OpDesc> &op_desc,
std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos) {
GE_CHECK_NOTNULL(op_desc);
if (HcomOmeUtil::CheckKernelHcclInfo(op_desc, kernel_hccl_infos) != SUCCESS) {
GELOGE(PARAM_INVALID, "HcomOmeUtil:: the number of GETaskKernelHcclInfo is invalid.");
return PARAM_INVALID;
}
GELOGI("Set hccl task input output address, node[%s}, type[%s] kernel_hccl_infos.size[%zu].",
op_desc->GetName().c_str(), op_desc->GetType().c_str(), kernel_hccl_infos.size());
if (op_desc->GetType() == HVDWAIT) {
return SUCCESS;
}
domi::Status dmrt;
hcclRedOp_t op_type;
hcclRedOp_t op_type = HCCL_REP_OP_SUM;
GE_CHECK_NOTNULL(davinci_model_);
GELOGI("Calc opType[%s] input address before. Node name[%s]", op_desc->GetType().c_str(), op_desc->GetName().c_str());
auto input_data_addr_list = ModelUtils::GetInputDataAddrs(davinci_model_->GetRuntimeParam(), op_desc);
if (!input_data_addr_list.empty()) {
input_data_addr_ = input_data_addr_list[0];
}

void *output_data_addr = nullptr;
auto output_data_addr_list = ModelUtils::GetOutputDataAddrs(davinci_model_->GetRuntimeParam(), op_desc);
if (!output_data_addr_list.empty()) {
output_data_addr = output_data_addr_list[0];
}

if (hccl_type == HCOMBROADCAST) {
int64_t root_id;
dmrt = HcomOmeUtil::GetHcomRootId(op_desc, root_id);
if (dmrt != SUCCESS) {
GELOGE(FAILED, "davinci_model: GetHcomRootId fail! domi error: %u", dmrt);
return FAILED;
}
root_id_ = root_id;
} else if (hccl_type == HCOMALLGATHER || hccl_type == HCOMRECEIVE) {
output_data_addr_ = output_data_addr;
} else if (hccl_type == HCOMALLREDUCE) {
dmrt = HcomOmeUtil::GetHcomOperationType(op_desc, op_type);
if (dmrt != SUCCESS) {
GELOGE(FAILED, "davinci_model: GetHcomOperationType fail! domi error: %u", dmrt);
return FAILED;
}

output_data_addr_ = output_data_addr;
op_type_ = op_type;
} else if (hccl_type == HCOMREDUCESCATTER) {
dmrt = HcomOmeUtil::GetHcomOperationType(op_desc, op_type);
if (dmrt != SUCCESS) {
GELOGE(FAILED, "davinci_model: GetHcomOperationType fail! domi error: %u", dmrt);
return FAILED;
// initialize every kernel_hccl_info inputDataAddr
for (size_t i = 0; i < kernel_hccl_infos.size(); i++) {
std::string hccl_type = kernel_hccl_infos[i].hccl_type;
void *input_data_addr = input_data_addr_list.empty() ? nullptr : input_data_addr_list[i];
kernel_hccl_infos[i].inputDataAddr = input_data_addr;

void *output_data_addr = output_data_addr_list.empty() ? nullptr : output_data_addr_list[i];
if (hccl_type == HCOMALLGATHER || hccl_type == HCOMRECEIVE || hccl_type == HVDCALLBACKALLGATHER) {
kernel_hccl_infos[i].outputDataAddr = output_data_addr;
} else if (hccl_type == HCOMALLREDUCE || hccl_type == HCOMREDUCESCATTER || hccl_type == HVDCALLBACKALLREDUCE) {
dmrt = HcomOmeUtil::GetHcclOperationType(op_desc, op_type);
if (dmrt != SUCCESS) {
GELOGE(FAILED, "davinci_model: GetHcomOperationType fail! domi error: %u", dmrt);
return FAILED;
}
kernel_hccl_infos[i].outputDataAddr = output_data_addr;
kernel_hccl_infos[i].opType = op_type;
}

output_data_addr_ = output_data_addr;
op_type_ = op_type;
davinci_model_->DisableZeroCopy(input_data_addr);
}

davinci_model_->DisableZeroCopy(input_data_addr_);
return SUCCESS;
}

void HcclTaskInfo::TransToGETaskInfo(GETaskInfo &ge_task) {
ge_task.id = id_;
ge_task.type = static_cast<uint16_t>(RT_MODEL_TASK_HCCL);
ge_task.stream = stream_;

ge_task.kernelHcclInfo.hccl_type = hccl_type_;
ge_task.kernelHcclInfo.inputDataAddr = input_data_addr_;
ge_task.kernelHcclInfo.outputDataAddr = output_data_addr_;
ge_task.kernelHcclInfo.workSpaceAddr = workspace_addr_;
ge_task.kernelHcclInfo.count = count_;
ge_task.kernelHcclInfo.dataType = static_cast<int32_t>(data_type_);
ge_task.kernelHcclInfo.opType = static_cast<int32_t>(op_type_);
ge_task.kernelHcclInfo.rootId = root_id_;
ge_task.kernelHcclInfo.workSpaceMemSize = workspace_mem_size_;
ge_task.kernelHcclInfo.hcclStreamList = hccl_stream_list_;

ge_task.kernelHcclInfo = kernel_hccl_infos_;
ge_task.privateDef = private_def_;
ge_task.privateDefLen = private_def_len_;
ge_task.opsKernelStorePtr = ops_kernel_store_;
for (size_t i = 0; i < ge_task.kernelHcclInfo.size(); i++) {
ge_task.kernelHcclInfo[i].hcclStreamList = hccl_stream_list_;
}
}

void HcclTaskInfo::GetPrivateDefByTaskDef(const domi::TaskDef &task) {
// Get privateDef and opsKernelStorePtr from taskDef and save them in taskInfo
GELOGI("get custom info in modelTaskDef.");
@@ -299,11 +283,54 @@ void HcclTaskInfo::GetPrivateDefByTaskDef(const domi::TaskDef &task) {
GELOGE(RT_FAILED, "Call rtMemcpy Fail, ret = 0x%X.", ret);
return;
}

GELOGI("The first address of the custom info, privateDef=%p.", private_def_);
}
}
}

void HcclTaskInfo::CreateKernelHcclInfo(const ge::ConstOpDescPtr &op_desc) {
GE_CHECK_NOTNULL_JUST_RETURN(op_desc);
if (HcomOmeUtil::IsHCOMOp(op_desc->GetType())) {
GETaskKernelHcclInfo kernel_hccl_info;
kernel_hccl_infos_.emplace_back(kernel_hccl_info);
} else if (HcomOmeUtil::IsHorovodOp(op_desc->GetType())) {
// HVDWAIT has no inputs, but still creates one GETaskKernelHcclInfo to record the hccl_type.
// For other operators, the number of GETaskKernelHcclInfo must equal the number of inputs.
if (op_desc->GetType() == HVDWAIT) {
GETaskKernelHcclInfo kernel_hccl_info;
kernel_hccl_infos_.emplace_back(kernel_hccl_info);
return;
}
for (size_t i = 0; i < op_desc->GetInputsSize(); i++) {
GETaskKernelHcclInfo kernel_hccl_info;
kernel_hccl_infos_.emplace_back(kernel_hccl_info);
}
}
}
Status HcclTaskInfo::SetWorkspace(const std::shared_ptr<OpDesc> &op_desc,
std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos) {
GE_CHECK_NOTNULL(op_desc);
GELOGI("SetWorkspace Node[%s] opType[%s] set workspace.", op_desc->GetName().c_str(), op_desc->GetType().c_str());
uint64_t workspace_mem_size = 0;
void *workspace_addr = nullptr;
auto workspace_bytes = op_desc->GetWorkspaceBytes();
if (!workspace_bytes.empty()) {
uint64_t workspace_mem_size_tmp = workspace_bytes[0];
GELOGI("hccl need workSpaceMemSize=%lu", workspace_mem_size_tmp);
if (workspace_mem_size_tmp != 0) {
workspace_mem_size = workspace_mem_size_tmp;
vector<void *> workspace_data_addrs =
ModelUtils::GetWorkspaceDataAddrs(davinci_model_->GetRuntimeParam(), op_desc);
if (!workspace_data_addrs.empty()) {
GELOGI("Get workSpaceAddr");
workspace_addr = workspace_data_addrs[0];
}
}
}
for (size_t i = 0; i < kernel_hccl_infos.size(); i++) {
kernel_hccl_infos[i].workSpaceMemSize = workspace_mem_size;
kernel_hccl_infos[i].workSpaceAddr = workspace_addr;
}
return SUCCESS;
}
REGISTER_TASK_INFO(RT_MODEL_TASK_HCCL, HcclTaskInfo);
} // namespace ge

+ 10
- 19
src/ge/graph/load/new_model_manager/task_info/hccl_task_info.h

@@ -18,9 +18,9 @@
#define GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_HCCL_TASK_INFO_H_

#include <memory>
#include <mutex>
#include <string>
#include <vector>
#include <mutex>

#include "common/opskernel/ge_task_info.h"
#include "graph/load/new_model_manager/task_info/task_info.h"
@@ -30,16 +30,7 @@ class HcclTaskInfo : public TaskInfo {
public:
HcclTaskInfo()
: davinci_model_(nullptr),
hccl_type_(""),
input_data_addr_(nullptr),
output_data_addr_(nullptr),
count_(0),
data_type_(HCCL_DATA_TYPE_INT8),
op_type_(HCCL_REP_OP_SUM),
root_id_(0),
id_(0),
workspace_addr_(nullptr),
workspace_mem_size_(0),
hccl_stream_list_(),
ops_kernel_store_(nullptr),
private_def_(nullptr),
@@ -56,6 +47,8 @@ class HcclTaskInfo : public TaskInfo {
private:
ge::Status SetAddrs(const std::string &hccl_type, const std::shared_ptr<OpDesc> &op);

Status SetAddrs(const std::shared_ptr<OpDesc> &op_desc, std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos);

void TransToGETaskInfo(GETaskInfo &ge_task);

void GetPrivateDefByTaskDef(const domi::TaskDef &task);
@@ -64,23 +57,21 @@ class HcclTaskInfo : public TaskInfo {

ge::Status CreateStream(int64_t stream_num, DavinciModel *davinci_model);

Status SetFollowStream(const ge::ConstOpDescPtr &op_desc, DavinciModel *davinci_model);

void CreateKernelHcclInfo(const ge::ConstOpDescPtr &op_desc);

Status SetWorkspace(const std::shared_ptr<OpDesc> &op_desc, std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos);

DavinciModel *davinci_model_;
string hccl_type_;
void *input_data_addr_;
void *output_data_addr_;
int32_t count_;
hcclDataType_t data_type_;
hcclRedOp_t op_type_;
int64_t root_id_;
uint32_t id_;
void *workspace_addr_;
uint64_t workspace_mem_size_;
vector<rtStream_t> hccl_stream_list_;
void *ops_kernel_store_;
void *private_def_;
uint32_t private_def_len_;
static std::mutex hccl_follow_stream_mutex_;
static uint32_t max_node_of_hccl_stream_;
vector<GETaskKernelHcclInfo> kernel_hccl_infos_;
};
} // namespace ge
#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_HCCL_TASK_INFO_H_

+ 83
- 21
src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.cc

@@ -51,17 +51,7 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin
GELOGE(INTERNAL_ERROR, "Init aicpu task info error, index is out of range!");
return INTERNAL_ERROR;
}

if (CopyTaskInfo(kernel_ex_def, rts_param, op_desc) != SUCCESS) {
GELOGE(FAILED, "copy task info to workspace failed.");
return FAILED;
}

const vector<void *> workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc);
if (workspace_data_addrs.empty()) {
GELOGE(FAILED, "workspace_data_addrs is empty.");
return FAILED;
}
op_desc_ = op_desc;

// 2. Reconstruct kernelExDef.args to STR_FWK_OP_KERNEL
STR_FWK_OP_KERNEL fwk_op_kernel = {0};
@@ -87,7 +77,52 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin
}
}

auto session_id = fwk_op_kernel.fwkKernelBase.fwk_kernel.sessionID;
// 2.2 Collect aicpu kernel
uint64_t kernel_id = fwk_op_kernel.fwkKernelBase.fwk_kernel.kernelID;
GE_IF_BOOL_EXEC(ModelManager::GetInstance()->CreateAicpuKernel(session_id, davinci_model->Id(), kernel_id) != SUCCESS,
GELOGE(FAILED, "CreateAicpuKernel error.");
return FAILED;)

kernel_buf_size_ = sizeof(STR_FWK_OP_KERNEL);
if (davinci_model_->IsKnownNode()) {
void *input_output_addr = davinci_model_->GetCurrentArgsAddr(args_offset_);
fwk_op_kernel.fwkKernelBase.fwk_kernel.inputOutputAddr =
static_cast<uint64_t>(reinterpret_cast<uintptr_t>(input_output_addr));
void *workspace_base_addr = nullptr;
rtError_t rt_ret = rtMalloc(&workspace_base_addr, kernel_ex_def.task_info_size(), RT_MEMORY_HBM);
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMalloc error, ret: Ox%X", rt_ret); return FAILED;);
rt_ret = rtMemcpy(workspace_base_addr, kernel_ex_def.task_info_size(), kernel_ex_def.task_info().data(),
kernel_ex_def.task_info_size(), RT_MEMCPY_HOST_TO_DEVICE);
fwk_op_kernel.fwkKernelBase.fwk_kernel.workspaceBaseAddr =
static_cast<uint64_t>(reinterpret_cast<uintptr_t>(workspace_base_addr));
fwk_op_kernel.fwkKernelBase.fwk_kernel.stepIDAddr = step_id_addr;
fwk_op_kernel.fwkKernelBase.fwk_kernel.extInfoNum = 0;
fwk_op_kernel.fwkKernelBase.fwk_kernel.extInfoAddr = 0;

rt_ret = rtMalloc(&kernel_buf_, kernel_buf_size_, RT_MEMORY_HBM);
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMalloc error: 0x%X", rt_ret); return FAILED;)

rt_ret = rtMemcpy(kernel_buf_, kernel_buf_size_, static_cast<void *>(&fwk_op_kernel), kernel_buf_size_,
RT_MEMCPY_HOST_TO_DEVICE);
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMemcpy error, ret: Ox%X", rt_ret); return FAILED;)

GELOGI("KernelExTaskInfo knonw node Init Success.");
return SUCCESS;
}

// 3. Set workspaceaddr, inputOutputDataAddr
if (CopyTaskInfo(kernel_ex_def, rts_param, op_desc) != SUCCESS) {
GELOGE(FAILED, "copy task info to workspace failed.");
return FAILED;
}

const vector<void *> workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc);
if (workspace_data_addrs.empty()) {
GELOGE(FAILED, "workspace_data_addrs is empty.");
return FAILED;
}

uint64_t workspace_base_addr = reinterpret_cast<uint64_t>(reinterpret_cast<uintptr_t>(workspace_data_addrs[0]));
const vector<void *> input_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc);
const vector<void *> output_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc);
@@ -106,8 +141,7 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin

if (PropertiesManager::Instance().IsLayerNeedDump(davinci_model_->Name(), op_desc->GetName())) {
dump_flag_ = RT_KERNEL_DUMPFLAG;
dump_args_ =
reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(input_output_addr_) + sizeof(void *) * input_addrs.size());
dump_args_ = input_output_addr_;
}
}

@@ -119,16 +153,10 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin
fwk_op_kernel.fwkKernelBase.fwk_kernel.extInfoAddr = 0;

// 4. Create session
auto session_id = fwk_op_kernel.fwkKernelBase.fwk_kernel.sessionID;
GE_CHECK_NOTNULL(ModelManager::GetInstance());
GE_IF_BOOL_EXEC(ModelManager::GetInstance()->CreateAicpuSession(session_id) != SUCCESS,
GELOGE(FAILED, "CreateAicpuSession error. session id: %lu", session_id);
return FAILED;)
// 4.1 Collect aicpu kernel
uint64_t kernel_id = fwk_op_kernel.fwkKernelBase.fwk_kernel.kernelID;
GE_IF_BOOL_EXEC(ModelManager::GetInstance()->CreateAicpuKernel(session_id, davinci_model->Id(), kernel_id) != SUCCESS,
GELOGE(FAILED, "CreateAicpuKernel error.");
return FAILED;)
// 5. Return result
rtError_t rt_ret = rtMalloc(&kernel_buf_, sizeof(STR_FWK_OP_KERNEL), RT_MEMORY_HBM);
GE_IF_BOOL_EXEC(rt_ret != RT_ERROR_NONE, GELOGE(rt_ret, "rtMalloc error: 0x%X", rt_ret); return FAILED;)
@@ -144,12 +172,46 @@ Status KernelExTaskInfo::Init(const domi::TaskDef &task_def, DavinciModel *davin
virtual_io_addrs.insert(virtual_io_addrs.end(), virtual_out_addrs.begin(), virtual_out_addrs.end());
davinci_model_->SetZeroCopyAddr(op_desc, virtual_io_addrs, io_addrs.data(), input_output_addr_, addrs_size, 0);

kernel_buf_size_ = sizeof(STR_FWK_OP_KERNEL);

GELOGI("KernelExTaskInfo Init Success. session id: %lu", session_id);
return SUCCESS;
}

Status KernelExTaskInfo::CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) {
auto kernel_ex_def = task_def.kernel_ex();
uint32_t op_index = kernel_ex_def.op_index();
OpDescPtr op_desc = davinci_model->GetOpByIndex(op_index);
if (op_desc == nullptr) {
GELOGE(INTERNAL_ERROR, "Init aicpu task info error, index is out of range!");
return INTERNAL_ERROR;
}
args_offset_ = davinci_model->GetTotalArgsSize();
const size_t inputs_size = op_desc->GetInputsSize();
const size_t outputs_size = op_desc->GetOutputsSize();
// aicpu kernel input/output size
size_t mem_length = inputs_size + outputs_size;
uint32_t mem_size = sizeof(uint64_t) * mem_length;
davinci_model->SetTotalArgsSize(mem_size);
GELOGI("kernel task name %s, args_size %u, args_offset %u", op_desc->GetName().c_str(), mem_size, args_offset_);
return SUCCESS;
}

Status KernelExTaskInfo::UpdateArgs() {
GELOGI("KernelExTaskInfo::UpdateArgs in.");
const RuntimeParam &rts_param = davinci_model_->GetRuntimeParam();
vector<void *> io_addrs;
vector<void *> input_data_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc_);
vector<void *> output_data_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc_);

io_addrs.insert(io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end());
io_addrs.insert(io_addrs.end(), output_data_addrs.begin(), output_data_addrs.end());

GE_CHK_STATUS_RET(davinci_model_->UpdateKnownZeroCopyAddr(io_addrs, args_offset_),
"update known node %s zero copy addr failed.", op_desc_->GetName().c_str());

GELOGI("KernelExTaskInfo::UpdateArgs success.");
return SUCCESS;
}
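Taken together, CalculateArgs and UpdateArgs implement a two-phase scheme for known (statically built) nodes: at build time each task reserves a slice of one model-wide args buffer, and before execution the current I/O addresses are rewritten into that slice. A rough sketch of the reservation step, assuming GetTotalArgsSize/SetTotalArgsSize accumulate as the diff suggests:

// Sketch: reserve a per-task slice in the model-wide args buffer.
args_offset_ = davinci_model->GetTotalArgsSize();                   // slice starts at current end
uint32_t slice = sizeof(uint64_t) * (inputs_size + outputs_size);   // one pointer per tensor
davinci_model->SetTotalArgsSize(slice);                             // assumed to grow the buffer by this slice
// Later, UpdateArgs copies fresh input/output pointers into [args_offset_, args_offset_ + slice).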

Status KernelExTaskInfo::CopyTaskInfo(const domi::KernelExDef &kernel_def, const RuntimeParam &rts_param,
const OpDescPtr &op_desc) {
// Userspace copy need virtual address.


+ 6
- 0
src/ge/graph/load/new_model_manager/task_info/kernel_ex_task_info.h

@@ -41,6 +41,10 @@ class KernelExTaskInfo : public TaskInfo {

Status Release() override;

Status UpdateArgs() override;

Status CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) override;

uint32_t GetTaskID() override { return task_id_; }

uint32_t GetStreamId() override { return stream_id_; }
@@ -61,6 +65,8 @@ class KernelExTaskInfo : public TaskInfo {
void *kernel_buf_;
void *input_output_addr_;
void *dump_args_;
OpDescPtr op_desc_ = nullptr;
uint32_t args_offset_ = 0;
};
} // namespace ge
#endif // GE_GRAPH_LOAD_NEW_MODEL_MANAGER_TASK_INFO_KERNEL_EX_TASK_INFO_H_

+ 40
- 2
src/ge/graph/load/new_model_manager/task_info/kernel_task_info.cc

@@ -343,6 +343,10 @@ Status KernelTaskInfo::SuperKernelDistribute() {

Status KernelTaskInfo::Distribute() {
GELOGD("KernelTaskInfo Distribute Start.");
if (davinci_model_->IsKnownNode()) {
args_ = davinci_model_->GetCurrentArgsAddr(args_offset_);
GELOGI("Known node %s args addr %p, offset %u.", op_desc_->GetName().c_str(), args_, args_offset_);
}
rtError_t rt_ret = RT_ERROR_NONE;
char *skt_enable_env = getenv("SKT_ENABLE");
int64_t env_flag = (skt_enable_env != nullptr) ? strtol(skt_enable_env, nullptr, 10) : 0;
@@ -380,7 +384,29 @@ Status KernelTaskInfo::Distribute() {
return SUCCESS;
}

Status KernelTaskInfo::UpdateArgs() {
GELOGI("KernelTaskInfo::UpdateArgs in.");
const RuntimeParam &rts_param = davinci_model_->GetRuntimeParam();
vector<void *> input_data_addrs = ModelUtils::GetInputDataAddrs(rts_param, op_desc_);
vector<void *> output_data_addrs = ModelUtils::GetOutputDataAddrs(rts_param, op_desc_);
vector<void *> workspace_data_addrs = ModelUtils::GetWorkspaceDataAddrs(rts_param, op_desc_);

vector<void *> io_addrs;
io_addrs.insert(io_addrs.end(), input_data_addrs.begin(), input_data_addrs.end());
io_addrs.insert(io_addrs.end(), output_data_addrs.begin(), output_data_addrs.end());
io_addrs.insert(io_addrs.end(), workspace_data_addrs.begin(), workspace_data_addrs.end());

GE_CHK_STATUS_RET(davinci_model_->UpdateKnownZeroCopyAddr(io_addrs, args_offset_),
"update known node %s zero copy addr failed.", op_desc_->GetName().c_str());

GELOGI("KernelTaskInfo::UpdateArgs success.");
return SUCCESS;
}

Status KernelTaskInfo::Release() {
if (davinci_model_ != nullptr && davinci_model_->IsKnownNode()) {
return SUCCESS;
}
FreeRtMem(&args_);
FreeRtMem(&flowtable_);
FreeRtMem(&custom_info_.input_descs);
@@ -439,6 +465,15 @@ Status KernelTaskInfo::UpdateL2Data(const domi::KernelDef &kernel_def) {
return SUCCESS;
}

Status KernelTaskInfo::CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) {
domi::KernelDef kernel_def = task_def.kernel();
uint32_t args_size = kernel_def.args_size();
args_offset_ = davinci_model->GetTotalArgsSize();
davinci_model->SetTotalArgsSize(args_size);
GELOGI("kernel task name , args_size %u, args_offset %u", args_size, args_offset_);
return SUCCESS;
}

Status KernelTaskInfo::InitTVMTask(uint16_t offset, const domi::KernelDef &kernel_def) {
GELOGD("Do InitTVMTask.");
GE_CHECK_NOTNULL(davinci_model_);
@@ -448,6 +483,9 @@ Status KernelTaskInfo::InitTVMTask(uint16_t offset, const domi::KernelDef &kerne
GELOGE(INTERNAL_ERROR, "InitTVMTaskInfo error, index:%u out of range!", ctx_.opIndex);
return INTERNAL_ERROR;
}
if (davinci_model_->IsKnownNode()) {
return SUCCESS;
}

// Update Stub
// During training, when the second call to DavinciModel::init() comes here, stub_func_ is already valid,
@@ -512,7 +550,7 @@ Status KernelTaskInfo::InitTVMTask(uint16_t offset, const domi::KernelDef &kerne

if (PropertiesManager::Instance().IsLayerNeedDump(davinci_model_->Name(), op_desc->GetName())) {
dump_flag_ = RT_KERNEL_DUMPFLAG;
dump_args_ = static_cast<char *>(args_) + offset + kAddrLen * input_data_addrs.size();
dump_args_ = static_cast<char *>(args_) + offset;
}

// update origin l2 data
@@ -771,7 +809,7 @@ Status KernelTaskInfo::InitAicpuTask(uint32_t op_index, const domi::KernelDef &k

if (PropertiesManager::Instance().IsLayerNeedDump(davinci_model_->Name(), op_desc->GetName())) {
dump_flag_ = RT_KERNEL_DUMPFLAG;
dump_args_ = static_cast<char *>(args_) + sizeof(aicpu::AicpuParamHead) + kAddrLen * input_addrs.size();
dump_args_ = static_cast<char *>(args_) + sizeof(aicpu::AicpuParamHead);
}

vector<void *> virtual_io_addrs; // use virtual address for zero copy key.


+ 5
- 0
src/ge/graph/load/new_model_manager/task_info/kernel_task_info.h

@@ -67,6 +67,10 @@ class KernelTaskInfo : public TaskInfo {

Status Distribute() override;

Status UpdateArgs() override;

Status CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) override;

Status Release() override;

cce::ccOpContext *GetCtx() override { return &ctx_; }
@@ -146,6 +150,7 @@ class KernelTaskInfo : public TaskInfo {
void *dump_args_;
OpDescPtr op_desc_;
DavinciModel *davinci_model_;
uint32_t args_offset_ = 0;

// For super kernel
uint32_t skt_id_;


+ 4
- 0
src/ge/graph/load/new_model_manager/task_info/task_info.h

@@ -62,6 +62,10 @@ class TaskInfo {

virtual Status Distribute() = 0;

virtual Status UpdateArgs() { return SUCCESS; }

virtual Status CalculateArgs(const domi::TaskDef &task_def, DavinciModel *davinci_model) { return SUCCESS; }

virtual Status Release() { return SUCCESS; }

virtual cce::ccOpContext *GetCtx() { return nullptr; }


+ 343
- 0
src/ge/graph/manager/graph_caching_allocator.cc

@@ -0,0 +1,343 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "graph/manager/graph_caching_allocator.h"

#include <set>
#include <string>
#include <utility>

#include "framework/common/debug/ge_log.h"
#include "graph/manager/graph_mem_allocator.h"

namespace ge {
const size_t bin_ranges[kNumBins] = {kRoundBlockSize * kKByteSize,
8 * kMByteSize,
32 * kMByteSize,
128 * kMByteSize,
kGByteSize,
4 * kGByteSize,
16 * kGByteSize,
26 * kGByteSize};

static bool BlockComparator(const Block *left, const Block *right) {
if (left->device_id != right->device_id) {
return left->device_id < right->device_id;
}
if (left->size != right->size) {
return left->size < right->size;
}
return reinterpret_cast<uintptr_t>(left->ptr) < reinterpret_cast<uintptr_t>(right->ptr);
}

bool CanMerge(Block *block) {
if (block == nullptr || block->allocated || !block->IsSplit()) {
return false;
}
return true;
}

size_t GetBinIndex(size_t size) {
size_t index = 0;
for (auto range : bin_ranges) {
if (size <= range) {
break;
}
++index;
}
if (index > kNumBins - 1) {
index = kNumBins - 1;
}
return index;
}

size_t GetAllocationSize(size_t size) {
size_t index = GetBinIndex(size);
return bin_ranges[index];
}

///
/// @ingroup ge_graph
/// @brief block size based on alignment
/// @param [in] original malloc size
/// @return allocation size
///
size_t GetBlockSize(size_t size) {
if (size == 0) {
return kRoundBlockSize;
}
return kRoundBlockSize * ((size + kRoundBlockSize - 1) / kRoundBlockSize);
}

bool ShouldSplit(const Block *block, size_t size) {
return static_cast<double>(size) <= (static_cast<double>(block->size) * kSplitThreshold);
}
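Taken together, these helpers decide how a request is served: the size is rounded to a 512-byte multiple, mapped to a bin, and whole bin ranges are fetched from the device at a time. A short worked example, with values derived from the constants above (illustrative only):

// Worked example (sketch): a 1000-byte request.
size_t rounded = GetBlockSize(1000);        // 1024: rounded up to a kRoundBlockSize multiple
size_t index   = GetBinIndex(rounded);      // 0: the first bin covers sizes up to 512 KB
size_t grabbed = GetAllocationSize(1000);   // 512 KB: the whole bin range is allocated at once
// ShouldSplit(512 KB block, 1024) is true, so the fetched block would then be split.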

CachingAllocator::CachingAllocator(rtMemType_t memory_type) : memory_type_(memory_type), memory_allocator_(nullptr) {
for (uint32_t i = 0; i < kNumBins; ++i) {
free_block_bins_[i] = nullptr;
}
}

Status CachingAllocator::Initialize(uint32_t device_id) {
GELOGI("Device id %u", device_id);
// when redo Initialize free old memory
FreeBlocks();
std::lock_guard<std::recursive_mutex> lock(mutex_);
for (uint32_t i = 0; i < kNumBins; ++i) {
if (free_block_bins_[i] != nullptr) {
continue;
}
auto bin_ptr = new (std::nothrow) BlockBin(BlockComparator);
if (bin_ptr == nullptr) {
GELOGE(ge::FAILED, "Alloc BlockBin failed.");
return ge::FAILED;
}
free_block_bins_[i] = bin_ptr;
}
memory_allocator_ = MemManager::Instance(memory_type_);
if (memory_allocator_ == nullptr) {
return ge::FAILED;
}
return ge::SUCCESS;
}

void CachingAllocator::Finalize(uint32_t device_id) {
GELOGI("Device id %u", device_id);
FreeBlocks();
FreeBlockBins();
}

uint8_t *CachingAllocator::Malloc(size_t size, uint8_t *org_ptr, uint32_t device_id) {
uint8_t *ptr = nullptr;
size = GetBlockSize(size);
Block *block = FindFreeBlock(size, org_ptr, device_id);
if (block != nullptr) {
ptr = block->ptr;
} else {
if (ge::SUCCESS == TryExtendCache(size, device_id)) {
block = FindFreeBlock(size, org_ptr, device_id);
if (block != nullptr) {
ptr = block->ptr;
}
}
}
if (ptr == nullptr) {
GELOGE(FAILED, "Malloc failed device id = %u, size= %zu", device_id, size);
} else {
std::lock_guard<std::recursive_mutex> lock(mutex_);
block->allocated = true;
allocated_blocks_[block->ptr] = block;
GELOGI("Malloc device id = %u, size= %zu", device_id, size);
}
return ptr;
}

Status CachingAllocator::Free(uint8_t *ptr, uint32_t device_id) {
GELOGI("Free device id = %u", device_id);
if (ptr == nullptr) {
GELOGE(PARAM_INVALID, "Invalid memory pointer");
return ge::PARAM_INVALID;
}

std::lock_guard<std::recursive_mutex> lock(mutex_);
auto it = allocated_blocks_.find(ptr);
if (it == allocated_blocks_.end()) {
GELOGE(PARAM_INVALID, "Invalid memory pointer");
return ge::PARAM_INVALID;
}
Block *block = it->second;
allocated_blocks_.erase(it);
FreeBlock(block);
return ge::SUCCESS;
}

void CachingAllocator::FreeBlock(Block *block) {
if (block == nullptr || !block->allocated) {
return;
}
GELOGI("Free block size = %zu", block->size);

std::lock_guard<std::recursive_mutex> lock(mutex_);
block->allocated = false;
auto &bin = *block->bin;
Block *merge_blocks[] = {block->prev, block->next};
for (Block *merge_block : merge_blocks) {
MergeBlocks(block, merge_block, bin);
}
bin.insert(block);
}

void CachingAllocator::MergeBlocks(Block *dst, Block *src, BlockBin &bin) {
if (!CanMerge(dst) || !CanMerge(src)) {
return;
}

if (dst->prev == src) {
dst->ptr = src->ptr;
dst->prev = src->prev;
if (dst->prev != nullptr) {
dst->prev->next = dst;
}
} else {
dst->next = src->next;
if (dst->next != nullptr) {
dst->next->prev = dst;
}
}

dst->size += src->size;
bin.erase(src);
delete src;
}

BlockBin *CachingAllocator::GetBlockBin(size_t size) {
size_t index = GetBinIndex(size);
return free_block_bins_[index];
}

Block *CachingAllocator::FindFreeBlock(size_t size, uint8_t *org_ptr, uint32_t device_id) {
// search with org_ptr - 1 so lower_bound prefers a block whose ptr matches org_ptr
Block key(device_id, size, (org_ptr == nullptr ? nullptr : org_ptr - 1));
BlockBin *bin = GetBlockBin(size);
if (bin == nullptr) {
GELOGE(ge::FAILED, "Get block bin failed size = %zu", size);
return nullptr;
}
std::lock_guard<std::recursive_mutex> lock(mutex_);
auto it = bin->lower_bound(&key);
if (it != bin->end()) {
Block *block = *it;
bin->erase(it);
if (block != nullptr) {
GELOGI("Find block size = %zu", block->size);
if (ShouldSplit(block, size)) {
return SplitBlock(block, size, *bin, device_id);
}
}
return block;
}
return nullptr;
}

Block *CachingAllocator::SplitBlock(Block *block, size_t size, BlockBin &bin, uint32_t device_id) {
// block has been checked, should not be nullptr
Block *remaining = block;
Block *new_block = new (std::nothrow) Block(device_id, size, &bin, block->ptr);
if (new_block == nullptr) {
GELOGE(ge::FAILED, "Alloc block failed size = %zu", size);
return block;
}
new_block->prev = remaining->prev;
if (new_block->prev != nullptr) {
new_block->prev->next = new_block;
}
new_block->next = remaining;
remaining->prev = new_block;
remaining->ptr = remaining->ptr + size;
remaining->size -= size;
bin.insert(remaining);
return new_block;
}
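The splice above keeps split siblings on a doubly linked list so that FreeBlock can later re-merge them; schematically (a sketch, not code from this commit):

// Before split: prev <-> block(ptr, size)
// After split:  prev <-> new_block(ptr, request) <-> remaining(ptr + request, size - request)
// new_block is returned to the caller; remaining goes back into the bin.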

Status CachingAllocator::TryExtendCache(size_t size, uint32_t device_id) {
auto memory_size = GetAllocationSize(size);
const std::string purpose = "Memory for caching.";
auto memory_addr = memory_allocator_->MallocMemory(purpose, memory_size, device_id);
// try to free caches and malloc again when malloc memory failed
if (memory_addr == nullptr) {
FreeCachedBlocks();
memory_addr = memory_allocator_->MallocMemory(purpose, memory_size, device_id);
if (memory_addr == nullptr) {
GELOGE(ge::FAILED, "TryExtendCache failed, no enough memory for size = %zu, device_id = %u", memory_size,
device_id);
return ge::FAILED;
}
}
if (AddToBlockBin(memory_addr, memory_size) != ge::SUCCESS) {
(void)memory_allocator_->FreeMemory(memory_addr);
return ge::FAILED;
}
return ge::SUCCESS;
}

Status CachingAllocator::AddToBlockBin(uint8_t *ptr, size_t size) {
BlockBin *bin = GetBlockBin(size);
if (bin == nullptr) {
GELOGE(ge::FAILED, "Get block bin failed size = %zu", size);
return ge::FAILED;
}
Block *block = new (std::nothrow) Block(0, size, bin, nullptr);
if (block == nullptr) {
GELOGE(ge::FAILED, "Alloc block failed size = %zu", size);
return ge::FAILED;
}

GELOGI("Block size = %zu", size);
block->ptr = ptr;
block->size = size;

std::lock_guard<std::recursive_mutex> lock(mutex_);
bin->insert(block);
return ge::SUCCESS;
}

void CachingAllocator::FreeCachedBlocks() {
GELOGI("Free cached blocks");
std::lock_guard<std::recursive_mutex> lock(mutex_);
for (uint32_t i = 0; i < kNumBins; ++i) {
auto pool = free_block_bins_[i];
if (pool == nullptr) {
continue;
}
for (auto it = pool->begin(); it != pool->end();) {
Block *block = *it;
// free block memory that has not been split
if ((block != nullptr) && (block->ptr != nullptr) && (block->prev == nullptr) && (block->next == nullptr) &&
(memory_allocator_->FreeMemory(block->ptr) == ge::SUCCESS)) {
pool->erase(it++);
delete block;
continue;
}
++it;
}
}
}

void CachingAllocator::FreeBlocks() {
GELOGI("Free blocks");
std::lock_guard<std::recursive_mutex> lock(mutex_);
// free allocated blocks and put to cache
for (auto &it : allocated_blocks_) {
FreeBlock(it.second);
}
allocated_blocks_.clear();

FreeCachedBlocks();
}

void CachingAllocator::FreeBlockBins() {
GELOGI("Free block bins");
std::lock_guard<std::recursive_mutex> lock(mutex_);
for (uint32_t i = 0; i < kNumBins; ++i) {
if (free_block_bins_[i] != nullptr) {
delete free_block_bins_[i];
free_block_bins_[i] = nullptr;
}
}
}

} // namespace ge

+ 212
- 0
src/ge/graph/manager/graph_caching_allocator.h

@@ -0,0 +1,212 @@
/**
* Copyright 2019-2020 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#ifndef GE_GRAPH_MANAGER_GRAPH_CACHING_ALLOCATOR_H_
#define GE_GRAPH_MANAGER_GRAPH_CACHING_ALLOCATOR_H_

#include <iostream>
#include <map>
#include <memory>
#include <mutex>
#include <string>
#include <vector>
#include <set>
#include <unordered_map>
#include <unordered_set>

#include "framework/common/ge_inner_error_codes.h"
#include "graph/node.h"
#include "runtime/mem.h"

namespace ge {

constexpr size_t kRoundBlockSize = 512; // all block sizes are rounded to at least 512 bytes
constexpr double kSplitThreshold = 0.75; // split when malloc size <= small block size * kSplitThreshold
constexpr size_t kKByteSize = 1024;
constexpr size_t kMByteSize = 1024 * 1024;
constexpr size_t kGByteSize = 1024 * 1024 * 1024;

struct Block;
typedef bool (*Comparison)(const Block *, const Block *);
using BlockBin = std::set<Block *, Comparison>;
static const uint32_t kNumBins = 8;

struct Block {
uint32_t device_id; // npu device id
size_t size; // block size in bytes
BlockBin *bin; // owning block bin
uint8_t *ptr; // memory address
bool allocated; // in-use flag
Block *prev; // prev block if split from a larger allocation
Block *next; // next block if split from a larger allocation

Block(uint32_t device, size_t size, BlockBin *bin, uint8_t *ptr)
: device_id(device), size(size), bin(bin), ptr(ptr), allocated(0), prev(nullptr), next(nullptr) {}

// constructor for search key
Block(uint32_t device, size_t size, uint8_t *ptr)
: device_id(device), size(size), bin(nullptr), ptr(ptr), allocated(0), prev(nullptr), next(nullptr) {}

bool IsSplit() const { return (prev != nullptr) || (next != nullptr); }
};
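Because BlockBin is a std::set ordered by BlockComparator (device id, then size, then address), a best-fit lookup is a single lower_bound on a stack-allocated search key, mirroring FindFreeBlock above. A minimal sketch assuming the types declared here:

// Sketch: best-fit lookup in one bin.
Block key(device_id, size, nullptr);   // search-key constructor (no owning bin)
auto it = bin->lower_bound(&key);      // first block on this device with size >= requested
if (it != bin->end()) { /* candidate found; may still be split to fit */ }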

class MemoryAllocator;

class CachingAllocator {
public:
explicit CachingAllocator(rtMemType_t memory_type);

virtual ~CachingAllocator() = default;

///
/// @ingroup ge_graph
/// @brief caching allocator init
/// @param [in] device id
/// @return Status of init
///
Status Initialize(uint32_t device_id = 0);

///
/// @ingroup ge_graph
/// @brief memory allocator finalize, release cached memory
/// @return void
///
void Finalize(uint32_t device_id = 0);

///
/// @ingroup ge_graph
/// @brief malloc memory
/// @param [in] size memory size
/// @param [in] org_ptr try to reuse memory at the same address
/// @param [in] device_id device id
/// @return memory address
///
uint8_t *Malloc(size_t size, uint8_t *org_ptr = nullptr, uint32_t device_id = 0);

///
/// @ingroup ge_graph
/// @brief free memory
/// @param [in] memory_addr memory address to free
/// @param [in] device_id device id
/// @return Status result of function
///
Status Free(uint8_t *memory_addr, uint32_t device_id = 0);

private:
///
/// @ingroup ge_graph
/// @brief extend cache by size
/// @param [in] memory size
/// @param [in] device id
/// @return Status result of function
///
Status TryExtendCache(size_t size, uint32_t device_id);

///
/// @ingroup ge_graph
/// @brief find free block by size
/// @param [in] memory size
/// @param [in] device_id device id
/// @return block ptr
///
Block *FindFreeBlock(size_t size, uint8_t *org_ptr, uint32_t device_id);

///
/// @ingroup ge_graph
/// @brief get the right bin based on size
/// @param [in] original malloc size
/// @return block bin
///
BlockBin *GetBlockBin(size_t size);

///
/// @ingroup ge_graph
/// @brief add memory to right bin based on size
/// @param [in] memory ptr
/// @param [in] memory size
/// @return Status result of function
///
Status AddToBlockBin(uint8_t *ptr, size_t size);

///
/// @ingroup ge_graph
/// @brief free block to right bin
/// @param [in] block ptr
/// @return void
///
void FreeBlock(Block *block);

///
/// @ingroup ge_graph
/// @brief free all cached blocks to right bin and release the memory when memory is not enough
/// @return void
///
void FreeCachedBlocks();

///
/// @ingroup ge_graph
/// @brief free allocated and cached blocks and release the memory when process exit
/// @return void
///
void FreeBlocks();

///
/// @ingroup ge_graph
/// @brief free block bins when process exit
/// @return void
///
void FreeBlockBins();

///
/// @ingroup ge_graph
/// @brief If a split block is freed, try merging with the original block
/// @param [inout] dest block ptr
/// @param [in] src block ptr
/// @param [out] block bin
/// @return void
///
void MergeBlocks(Block *dst, Block *src, BlockBin &bin);

///
/// @ingroup ge_graph
/// @brief If the requested size is much smaller than the free block, try to split the block
/// @param [in] original block ptr
/// @param [in] allocated memory size
/// @param [in] block bin
/// @param [in] device id
/// @return split block ptr
///
Block *SplitBlock(Block *block, size_t size, BlockBin &bin, uint32_t device_id);

private:
rtMemType_t memory_type_;

// device memory allocator
MemoryAllocator *memory_allocator_;

// lock around all operations
mutable std::recursive_mutex mutex_;

// allocated blocks by memory pointer
std::unordered_map<uint8_t *, Block *> allocated_blocks_;

// block bins by different block size
BlockBin *free_block_bins_[kNumBins];
};

} // namespace ge

#endif // GE_GRAPH_MANAGER_GRAPH_CACHING_ALLOCATOR_H_

+ 327
- 249
src/ge/graph/manager/graph_manager.cc
File diff suppressed because it is too large


+ 17
- 13
src/ge/graph/manager/graph_manager.h

@@ -99,7 +99,7 @@ class GraphManager {
/// @param [out] models build result
/// @return Status result of function
///
Status BuildGraph(const GraphId &graph_id, const std::vector<GeTensor> &inputs, vector<GeModelPtr> &models);
ge::Status BuildGraph(const GraphId &graph_id, const std::vector<GeTensor> &inputs, GeRootModelPtr &models);

///
/// @ingroup ge_graph
@@ -153,6 +153,8 @@ class GraphManager {

const std::map<std::string, std::string> *GetGraphOptions(uint32_t graph_id);

void SetOptionsRunGraphFlag(bool run_graph_flag);

private:
struct PreRunArgs {
GraphId graph_id;
@@ -166,7 +168,7 @@ class GraphManager {
GraphNodePtr graph_node;
GraphId graph_id;
std::vector<ge::InputTensorInfo> input_tensor;
GeModelPtr ge_model;
GeRootModelPtr ge_root_model;
GEThreadLocalContext context;
RunAsyncCallback callback;
};
@@ -177,19 +179,16 @@ class GraphManager {

static Status ProcessSubGraphWithMultiThreads(GraphManager *graph_manager, const SubGraphInfoPtr &sub_graph_info_ptr,
uint64_t session_id, const GEThreadLocalContext &ge_context);
Status PreRun(const GraphNodePtr &graph_node, const std::vector<GeTensor> &inputs, vector<GeModelPtr> &ge_models,
GeModelPtr &ge_model, uint64_t session_id = INVALID_SESSION_ID);

Status PreRunDynShape(const GraphNodePtr &graph_node, const std::vector<GeTensor> &inputs,
vector<GeModelPtr> &ge_models, GeModelPtr &ge_model, uint64_t session_id = INVALID_SESSION_ID);
Status PreRun(const GraphNodePtr &graph_node, const std::vector<GeTensor> &inputs, GeRootModelPtr &ge_root_model,
uint64_t session_id = INVALID_SESSION_ID);

Status OptimizeSubgraph(const GraphNodePtr &graph_node, ComputeGraphPtr &compute_graph, uint64_t session_id);

Status Build(const GraphNodePtr &graph_node, ComputeGraphPtr &compute_graph, vector<GeModelPtr> &ge_models,
GeModelPtr &ge_model, uint64_t session_id);
Status Build(const GraphNodePtr &graph_node, ComputeGraphPtr &compute_graph, GeRootModelPtr &ge_root_model,
uint64_t session_id);

Status StartForRunGraph(const GraphNodePtr &graph_node, const std::vector<GeTensor> &inputs,
vector<GeModelPtr> &ge_models, uint64_t session_id = INVALID_SESSION_ID);
GeRootModelPtr &ge_root_model, uint64_t session_id = INVALID_SESSION_ID);

Status InnerRunGraph(GraphNodePtr &graph_node, const GraphId &graph_id, const std::vector<GeTensor> &inputs,
std::vector<GeTensor> &outputs);
@@ -240,6 +239,8 @@ class GraphManager {

Status SetSubgraph(uint64_t session_id, ComputeGraphPtr compute_graph);

void SetAttrForHcomBroadCastOp(ge::ComputeGraphPtr &compute_graph);

bool IsBroadCastOpData(const ge::NodePtr &var_node);

void AdjustBroadCastOpData(const ge::NodePtr &var_node);
@@ -258,6 +259,7 @@ class GraphManager {
std::shared_ptr<GraphContext> GetGraphContext() const { return graph_context_; }

Status RemoveIsolatedConst(ge::ComputeGraphPtr &compute_graph);
Status RemoveIsolatedConstInThisGraph(ge::ComputeGraphPtr &compute_graph);

Status OptimizeStage1(ComputeGraphPtr &compute_graph);
Status OptimizeStage2(ComputeGraphPtr &compute_graph);
@@ -265,13 +267,13 @@ class GraphManager {

Status NewOptimizeAfterMergeSubGraph(ge::ComputeGraphPtr &compute_graph);

Status LoadGraphAsync(const GeModelPtr &ge_model, const GraphNodePtr &graph_node);
Status LoadGraphAsync(const GeRootModelPtr &ge_root_model, const GraphNodePtr &graph_node);

Status CheckAndReleaseMemory(const GeModelPtr &ge_model, const GraphNodePtr &graph_node);

bool CheckModelLoad(const GeModelPtr &ge_model, bool load_flag);
bool CheckModelLoad(const GeRootModelPtr &ge_model, bool load_flag);

Status LoadGraph(const GeModelPtr &ge_model, const GraphNodePtr &graph_node);
Status LoadGraph(const GeRootModelPtr &ge_root_model, const GraphNodePtr &graph_node);

bool IsGraphNeedBuild(const GraphNodePtr &graph_node);

@@ -287,6 +289,8 @@ class GraphManager {
static void StopQueue(GraphManager *graph_manager);
static void ReturnError(GraphManager *graph_manager, RunAsyncCallback callback, Status ret, const string &log);

void ChangeConstTypeWhenTraining(const ComputeGraphPtr &compute_graph);

std::atomic_bool thread_run_flag_;
BlockingQueue<PreRunArgs> prerun_args_q_{};
BlockingQueue<RunArgs> run_args_q_{};


+ 4
- 0
src/ge/graph/manager/graph_manager_utils.h

@@ -36,6 +36,7 @@
#include "graph/graph.h"
#include "graph/model.h"
#include "model/ge_model.h"
#include "model/ge_root_model.h"
#include "register/register_fmk_types.h"
#include "external/ge/ge_api_types.h"

@@ -160,6 +161,8 @@ class GraphNode {
void SetLoadFlag(bool load_flag) { load_flag_ = load_flag; }
void SetGeModel(const GeModelPtr &ge_model) { ge_model_ = ge_model; }
GeModelPtr GetGeModel() const { return ge_model_; }
void SetGeRootModel(const GeRootModelPtr &ge_root_model) { ge_root_model_ = ge_root_model; }
GeRootModelPtr GetGeRootModel() const { return ge_root_model_; }
const std::map<std::string, std::string> &GetOptions() const { return options_; }
void SetOptions(const std::map<std::string, std::string> &options) { options_ = options; }
void Lock();
@@ -179,6 +182,7 @@ class GraphNode {
bool build_flag_;
bool load_flag_;
GeModelPtr ge_model_;
GeRootModelPtr ge_root_model_;
BlockingQueue<uint8_t> sem_;
};



+ 71
- 19
src/ge/graph/manager/graph_mem_allocator.cc

@@ -15,6 +15,7 @@
*/

#include "graph/manager/graph_mem_allocator.h"
#include "graph/manager/graph_caching_allocator.h"

#include <set>
#include <string>
@@ -47,7 +48,7 @@ void MemoryAllocator::Finalize(uint32_t device_id) {
memory_base_map_.clear();
}

uint8_t *MemoryAllocator::MallocMemory(const string &purpose, uint64_t memory_size, uint32_t device_id) const {
uint8_t *MemoryAllocator::MallocMemory(const string &purpose, size_t memory_size, uint32_t device_id) const {
uint8_t *memory_addr = nullptr;

if (rtMalloc(reinterpret_cast<void **>(&memory_addr), memory_size, memory_type_) != RT_ERROR_NONE) {
@@ -74,7 +75,7 @@ Status MemoryAllocator::FreeMemory(uint8_t *memory_addr, uint32_t device_id) con
return ge::SUCCESS;
}

uint8_t *MemoryAllocator::MallocMemory(const string &purpose, const string &memory_key, uint64_t memory_size,
uint8_t *MemoryAllocator::MallocMemory(const string &purpose, const string &memory_key, size_t memory_size,
uint32_t device_id) {
auto it = memory_base_map_.find(memory_key);
if (it != memory_base_map_.end()) {
@@ -147,7 +148,7 @@ uint8_t *MemoryAllocator::GetMemoryAddr(const string &memory_key, uint32_t devic
return it->second.memory_addr_;
}

MemManager::MemManager() : default_memory_allocator_(nullptr) {}
MemManager::MemManager() {}

MemManager::~MemManager() { Finalize(); }

@@ -159,7 +160,7 @@ MemManager &MemManager::Instance() {
MemoryAllocator *MemManager::Instance(rtMemType_t memory_type) { return Instance().GetMemoryAllocator(memory_type); }

Status MemManager::Initialize(const std::vector<rtMemType_t> &memory_type) {
std::lock_guard<std::mutex> lock(allocator_mutex_);
std::lock_guard<std::recursive_mutex> lock(allocator_mutex_);
MemoryAllocator *memory_allocator = nullptr;
for (unsigned int index : memory_type) {
auto it = memory_allocator_map_.find(index);
@@ -184,34 +185,34 @@ Status MemManager::Initialize(const std::vector<rtMemType_t> &memory_type) {
}
}

default_memory_allocator_ = new (std::nothrow) MemoryAllocator(RT_MEMORY_RESERVED);
if (default_memory_allocator_ == nullptr) {
GELOGE(ge::INTERNAL_ERROR, "Create MemoryAllocator failed.");
return ge::INTERNAL_ERROR;
}
return ge::SUCCESS;
return InitCachingAllocator(memory_type);
}

void MemManager::Finalize() noexcept {
GELOGI("Finalize.");
std::lock_guard<std::mutex> lock(allocator_mutex_);
std::lock_guard<std::recursive_mutex> lock(allocator_mutex_);
// caching allocator use memory allocator, so finalize it first
for (auto &caching_allocator : caching_allocator_map_) {
if (caching_allocator.second != nullptr) {
caching_allocator.second->Finalize();
delete caching_allocator.second;
caching_allocator.second = nullptr;
}
}
caching_allocator_map_.clear();

for (auto &memory_allocator : memory_allocator_map_) {
if (memory_allocator.second != nullptr) {
memory_allocator.second->Finalize(0);
memory_allocator.second->Finalize();
delete memory_allocator.second;
memory_allocator.second = nullptr;
}
}

if (default_memory_allocator_ != nullptr) {
delete default_memory_allocator_;
default_memory_allocator_ = nullptr;
}
memory_allocator_map_.clear();
}

MemoryAllocator *MemManager::GetMemoryAllocator(rtMemType_t memory_type) {
std::lock_guard<std::mutex> lock(allocator_mutex_);
std::lock_guard<std::recursive_mutex> lock(allocator_mutex_);
MemoryAllocator *memory_allocator = nullptr;
auto it = memory_allocator_map_.find(memory_type);
if (it != memory_allocator_map_.end()) {
@@ -221,9 +222,60 @@ MemoryAllocator *MemManager::GetMemoryAllocator(rtMemType_t memory_type) {
// Usually impossible
if (memory_allocator == nullptr) {
GELOGE(ge::INTERNAL_ERROR, "GetMemoryAllocator failed, memory type is %u.", memory_type);
return default_memory_allocator_;
static MemoryAllocator default_memory_allocator(RT_MEMORY_RESERVED);
return &default_memory_allocator;
}

return memory_allocator;
}

Status MemManager::InitCachingAllocator(const std::vector<rtMemType_t> &memory_type) {
CachingAllocator *caching_allocator = nullptr;
for (unsigned int index : memory_type) {
auto it = caching_allocator_map_.find(index);
if (it == caching_allocator_map_.end()) {
caching_allocator = new (std::nothrow) CachingAllocator(index);
if (caching_allocator != nullptr) {
caching_allocator_map_[index] = caching_allocator;
GELOGI("Create CachingAllocator memory type[%u] success.", index);
} else {
GELOGE(ge::INTERNAL_ERROR, "Alloc CachingAllocator failed.");
}
} else {
caching_allocator = it->second;
}

if (caching_allocator == nullptr) {
GELOGE(ge::INTERNAL_ERROR, "Create CachingAllocator failed.");
return ge::INTERNAL_ERROR;
} else {
if (caching_allocator->Initialize() != ge::SUCCESS) {
return ge::INTERNAL_ERROR;
}
}
}
return ge::SUCCESS;
}

CachingAllocator &MemManager::GetCachingAllocator(rtMemType_t memory_type) {
std::lock_guard<std::recursive_mutex> lock(allocator_mutex_);
CachingAllocator *caching_allocator = nullptr;
auto it = caching_allocator_map_.find(memory_type);
if (it != caching_allocator_map_.end()) {
caching_allocator = it->second;
}

// Usually impossible
if (caching_allocator == nullptr) {
GELOGE(ge::INTERNAL_ERROR, "GetCachingAllocator failed, memory type is %u.", memory_type);
static CachingAllocator default_caching_allocator(RT_MEMORY_RESERVED);
return default_caching_allocator;
}
return *caching_allocator;
}

CachingAllocator &MemManager::CachingInstance(rtMemType_t memory_type) {
return Instance().GetCachingAllocator(memory_type);
}
} // namespace ge
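A likely usage pattern for the new caching path, as a sketch only (interfaces per this diff and the header below; the 1 MB size is arbitrary):

std::vector<rtMemType_t> types = {RT_MEMORY_HBM};
if (MemManager::Instance().Initialize(types) == ge::SUCCESS) {
  CachingAllocator &cache = MemManager::CachingInstance(RT_MEMORY_HBM);
  uint8_t *buf = cache.Malloc(1024 * 1024);  // served from a cached bin when possible
  if (buf != nullptr) {
    (void)cache.Free(buf);  // returns the block to its bin rather than to the runtime
  }
}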

+ 23
- 5
src/ge/graph/manager/graph_mem_allocator.h

@@ -88,7 +88,7 @@ class MemoryAllocator {
/// @param [in] device_id device id
/// @return memory address
///
uint8_t *MallocMemory(const string &purpose, uint64_t memory_size, uint32_t device_id = 0) const;
uint8_t *MallocMemory(const string &purpose, size_t memory_size, uint32_t device_id = 0) const;

///
/// @ingroup ge_graph
@@ -108,7 +108,7 @@ class MemoryAllocator {
/// @param [in] device_id device id
/// @return memory address
///
uint8_t *MallocMemory(const string &purpose, const string &memory_key, uint64_t memory_size, uint32_t device_id = 0);
uint8_t *MallocMemory(const string &purpose, const string &memory_key, size_t memory_size, uint32_t device_id = 0);

///
/// @ingroup ge_graph
@@ -135,6 +135,7 @@ class MemoryAllocator {
};

using MemoryAllocatorPtr = std::shared_ptr<MemoryAllocator>;
class CachingAllocator;

class MemManager {
public:
@@ -142,6 +143,7 @@ class MemManager {
virtual ~MemManager();
static MemManager &Instance();
static MemoryAllocator *Instance(rtMemType_t memory_type);
static CachingAllocator &CachingInstance(rtMemType_t memory_type);
MemManager(const MemManager &) = delete;
MemManager &operator=(const MemManager &) = delete;
///
@@ -164,13 +166,29 @@ class MemManager {
/// @ingroup ge_graph
/// @brief ge memory allocator
/// @param [in] memory_type memory type
/// @return Status result of function
/// @return MemoryAllocator ptr
///
MemoryAllocator *GetMemoryAllocator(rtMemType_t memory_type);

///
/// @ingroup ge_graph
/// @brief ge caching allocator
/// @param [in] memory_type memory type
/// @return CachingAllocator ptr
///
CachingAllocator &GetCachingAllocator(rtMemType_t memory_type);

///
/// @ingroup ge_graph
/// @brief ge create caching allocator
/// @param [in] memory_type memory type
/// @return Status result of function
///
Status InitCachingAllocator(const std::vector<rtMemType_t> &memory_type);

std::map<rtMemType_t, MemoryAllocator *> memory_allocator_map_;
MemoryAllocator *default_memory_allocator_;
std::mutex allocator_mutex_;
std::map<rtMemType_t, CachingAllocator *> caching_allocator_map_;
std::recursive_mutex allocator_mutex_;
};
} // namespace ge



+ 432
- 0
src/ge/graph/manager/trans_var_data_utils.cc

@@ -25,8 +25,343 @@
#include "graph/manager/graph_var_manager.h"
#include "graph/types.h"
#include "graph/utils/type_utils.h"
#include "common/thread_pool.h"
#include <algorithm>

namespace ge {
namespace {
class RtContextSwitchGuard {
public:
RtContextSwitchGuard(rtCtxMode_t mode, uint32_t device_id) : last_(nullptr), current_(nullptr) {
auto ret = rtCtxGetCurrent(&last_);
if (ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Failed to get current context from rt, error-code %d", ret);
return;
}

ret = rtCtxCreate(&current_, mode, static_cast<int32_t>(device_id));
if (ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Failed to create new context for device %u, error-code %d", device_id, ret);
return;
}

ret = rtCtxSetCurrent(current_);
if (ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Failed to switch context to normal, context %p, device %u", current_, device_id);
return;
}
GELOGD("Create and switch rt context %p type %d for device %u, backup last %p.", current_, mode, device_id, last_);
}

~RtContextSwitchGuard() {
if (current_ != nullptr) {
auto ret = rtCtxDestroy(current_);
GELOGD("Destory current context %p result %d", current_, ret);
}
if (last_ != nullptr) {
auto ret = rtCtxSetCurrent(last_);
GELOGD("Recovery last context %p result %d.", last_, ret);
}
}

private:
rtContext_t last_;
rtContext_t current_;
};
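
The guard is plain RAII: constructing it creates and activates a fresh runtime context, and the destructor destroys that context and restores the previous one even on early return. A sketch of the intended pattern as it would appear inside this file (the guard is file-local; device_id is illustrative):

Status DoWorkOnDevice(uint32_t device_id) {
  // Enter a new RT context for this device; restored automatically on scope exit.
  RtContextSwitchGuard switch_context(RT_CTX_NORMAL_MODE, device_id);
  // ... rtMemcpy / other runtime calls against the new context go here ...
  return SUCCESS;
}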

int64_t CalcVarSizeInBytes(const GeTensorDesc &desc) {
int64_t var_size = GetSizeByDataType(desc.GetDataType());
if (var_size <= 0) {
GELOGE(PARAM_INVALID, "Failed to calc var data size from data type %s",
TypeUtils::DataTypeToSerialString(desc.GetDataType()).c_str());
return -1;
}
auto shape = desc.GetShape();
auto dim_num = shape.GetDimNum();
for (size_t dim_index = 0; dim_index < dim_num; ++dim_index) {
var_size *= shape.GetDim(dim_index);
}
return var_size;
}
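
A quick worked example of the helper above (the GeTensorDesc constructor form is assumed from graph/ge_tensor.h): a DT_FLOAT16 tensor of shape (2, 3) has a 2-byte element size, so the loop yields 2 * 2 * 3 = 12 bytes, while an unsupported data type returns -1.

GeTensorDesc desc(GeShape({2, 3}), FORMAT_ND, DT_FLOAT16);
int64_t bytes = CalcVarSizeInBytes(desc);  // 2 bytes/elem * 6 elems = 12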

Status CopyVarToDevice(const NodePtr &var, const formats::TransResult &trans_result, void *var_addr) {
GELOGD("Copy var %s from host to device, size %zu", var->GetName().c_str(), trans_result.length);
auto ret = rtMemcpy(var_addr, trans_result.length, reinterpret_cast<void *>(trans_result.data.get()),
trans_result.length, RT_MEMCPY_HOST_TO_DEVICE);
if (ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Failed to copy memory to device, size %zu", trans_result.length);
return RT_FAILED;
}
return SUCCESS;
}

Status CopyVarFromDevice(uint64_t session_id, const NodePtr &var, std::unique_ptr<uint8_t[]> &var_data,
const GeTensorDesc &input_desc) {
uint8_t *var_logic = nullptr;
GE_CHECK_NOTNULL(var);
auto ret = VarManager::Instance(session_id)->GetVarAddr(var->GetName(), input_desc, &var_logic);
if (ret != SUCCESS) {
GELOGE(INTERNAL_ERROR,
"Failed to copy var %s from device, can not find it"
" from var manager %u",
var->GetName().c_str(), ret);
return INTERNAL_ERROR;
}

uint8_t *var_addr = VarManager::Instance(session_id)->GetVarMemoryAddr(var_logic, RT_MEMORY_HBM);
if (var_addr == nullptr) {
GELOGE(INTERNAL_ERROR,
"Failed to copy var %s from device, cant not get "
"var addr from logic addr %p",
var->GetName().c_str(), var_logic);
return INTERNAL_ERROR;
}

int64_t var_size_bytes = CalcVarSizeInBytes(input_desc);
if (var_size_bytes <= 0) {
return INTERNAL_ERROR;
}

std::unique_ptr<uint8_t[]> var_host(new (std::nothrow) uint8_t[var_size_bytes]);
if (var_host == nullptr) {
GELOGE(OUT_OF_MEMORY, "Failed to malloc rt-host memory, size %ld", var_size_bytes);
return OUT_OF_MEMORY;
}

ret = rtMemcpy(reinterpret_cast<void *>(var_host.get()), var_size_bytes, reinterpret_cast<void *>(var_addr),
var_size_bytes, RT_MEMCPY_DEVICE_TO_HOST);
if (ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED,
"Failed to copy var memory from device, var %s, size %ld,"
" rt-error-code %u",
var->GetName().c_str(), var_size_bytes, ret);
return RT_FAILED;
}

GELOGD("Copy var %s from device to host, size %ld", var->GetName().c_str(), var_size_bytes);
var_data.swap(var_host);

GELOGI("var_logic:%p, var_addr:%p", var_logic, var_addr);

return SUCCESS;
}

Status TransVarOnHost(uint8_t *var_data, const VarTransRoad &trans_road, formats::TransResult &result) {
formats::TransResult result_last_time{};
bool use_init_data = true;
for (const auto &trans_info : trans_road) {
if (trans_info.node_type == RESHAPE || trans_info.node_type == REFORMAT) {
GELOGD("Skip to trans variable data on the reshape/reformat node");
continue;
}
uint8_t *src_data = nullptr;
if (use_init_data) {
src_data = var_data;
use_init_data = false;
} else {
src_data = result_last_time.data.get();
}

formats::TransResult tmp_result{};
if (trans_info.node_type == TRANSDATA || trans_info.node_type == TRANSPOSED) {
auto src_format = trans_info.input.GetFormat();
auto src_shape = trans_info.input.GetShape().GetDims();
auto dst_format = trans_info.output.GetFormat();
auto dst_shape = trans_info.output.GetShape().GetDims();
auto data_type = trans_info.input.GetDataType();
GELOGD("Trans format from %s to %s, shape %s to %s, data-type %s",
TypeUtils::FormatToSerialString(src_format).c_str(), TypeUtils::FormatToSerialString(dst_format).c_str(),
formats::ShapeToString(src_shape).c_str(), formats::ShapeToString(dst_shape).c_str(),
TypeUtils::DataTypeToSerialString(data_type).c_str());
auto ret = formats::TransFormat({src_data, src_format, dst_format, src_shape, dst_shape, data_type}, tmp_result);
if (ret != SUCCESS) {
GELOGE(INTERNAL_ERROR,
"Failed to trans format from %s to %s, shape %s to %s, "
"data type %s error code %u",
TypeUtils::FormatToSerialString(src_format).c_str(), TypeUtils::FormatToSerialString(dst_format).c_str(),
formats::ShapeToString(src_shape).c_str(), formats::ShapeToString(dst_shape).c_str(),
TypeUtils::DataTypeToSerialString(data_type).c_str(), ret);
return ret;
}
} else if (trans_info.node_type == CAST) {
auto input_shape = trans_info.input.GetShape();
auto src_data_size = input_shape.GetShapeSize() == 0 ? 1 : input_shape.GetShapeSize();
auto src_data_type = trans_info.input.GetDataType();
auto dst_data_type = trans_info.output.GetDataType();
GELOGD("Trans data type from %s to %s, input shape %s, data size %ld",
TypeUtils::DataTypeToSerialString(src_data_type).c_str(),
TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), formats::ShapeToString(input_shape).c_str(),
src_data_size);
auto ret = formats::TransDataType({src_data, static_cast<size_t>(src_data_size), src_data_type, dst_data_type},
tmp_result);
if (ret != SUCCESS) {
GELOGE(INTERNAL_ERROR, "Failed to trans data type from %s to %s, input shape %s, data size %ld, error code %u",
TypeUtils::DataTypeToSerialString(src_data_type).c_str(),
TypeUtils::DataTypeToSerialString(dst_data_type).c_str(), formats::ShapeToString(input_shape).c_str(),
src_data_size, ret);
return ret;
}
} else {
GELOGE(UNSUPPORTED, "Failed to trans var data, the trans type %s does not supported",
trans_info.node_type.c_str());
return UNSUPPORTED;
}
result_last_time = tmp_result;
}

result = result_last_time;
return SUCCESS;
}
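
TransVarOnHost simply replays the trans road step by step on host memory, feeding each step's output into the next. A minimal one-step sketch casting FP32 to FP16, written as it would appear inside this file (the node_type/input/output fields of a road entry are exactly those read above; the TransNodeInfo type name is assumed from graph_var_manager.h):

// Hypothetical one-step road: cast 4 floats to fp16 on host.
// host_data is assumed to point at 16 bytes of valid float input.
VarTransRoad road;
TransNodeInfo cast_step;  // assumed struct name
cast_step.node_type = CAST;
cast_step.input = GeTensorDesc(GeShape({4}), FORMAT_ND, DT_FLOAT);
cast_step.output = GeTensorDesc(GeShape({4}), FORMAT_ND, DT_FLOAT16);
road.emplace_back(cast_step);

formats::TransResult result{};
Status ret = TransVarOnHost(host_data, road, result);  // result.length == 8 on success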

/// Re-allocate var memory on device using the var manager and
/// free the original var memory (not yet supported by the var manager)
/// @param session_id
/// @param var_name
/// @param tensor_desc
/// @param var_device
/// @return
Status ReAssignVarAddr(uint64_t session_id, const std::string &var_name, const GeTensorDesc &tensor_desc,
void **var_device) {
uint8_t *var_logic = nullptr;
Status ret = VarManager::Instance(session_id)->GetVarAddr(var_name, tensor_desc, &var_logic);
if (ret != SUCCESS) {
GELOGE(INTERNAL_ERROR,
"Failed to get var %s device addr, can not find it"
" from var manager %u",
var_name.c_str(), ret);
return INTERNAL_ERROR;
}

uint8_t *var_addr = VarManager::Instance(session_id)->GetVarMemoryAddr(var_logic, RT_MEMORY_HBM);
if (var_addr == nullptr) {
GELOGE(INTERNAL_ERROR, "Failed to convert var %s logic addr to real addr", var_name.c_str());
return INTERNAL_ERROR;
}
*var_device = var_addr;

GELOGI("var_logic:%p, var_addr:%p", var_logic, var_addr);

return SUCCESS;
}

Status TransVarData(const NodePtr &var, const VarTransRoad &trans_road, uint64_t session_id) {
// nothing to do if the trans_road contains only reshape/reformat nodes
GE_CHECK_NOTNULL(var);
bool need_trans = false;
for (auto &road : trans_road) {
if (road.node_type != RESHAPE && road.node_type != REFORMAT) {
need_trans = true;
break;
}
}
if (!need_trans) {
return SUCCESS;
}

// Sync var data from device
std::unique_ptr<uint8_t[]> var_data;
if (trans_road.empty()) {
GELOGE(INTERNAL_ERROR, "Failed to get trans_road, trans_road is empty.");
return INTERNAL_ERROR;
}
const GeTensorDesc &input_desc = trans_road.begin()->input;
auto ret = CopyVarFromDevice(session_id, var, var_data, input_desc);
if (ret != SUCCESS) {
return ret;
}

formats::TransResult trans_result{};
ret = TransVarOnHost(var_data.get(), trans_road, trans_result);
if (ret != SUCCESS) {
GELOGE(ret, "Failed to trans var data on host, error code %u", ret);
return ret;
}

void *var_device = nullptr;

/// It is a temporary solution to use the last GeTensorDesc to assign variable memory because the variable manager
/// depends on TensorDesc and it is difficult to be modified. The correct solution is to assign memory based on the
/// size of the converted variable. To complete the final solution, the dependency of the variable manager on
/// TensorDesc needs to be removed. This change is large and needs to be performed step by step.
ret = ReAssignVarAddr(session_id, var->GetName(), trans_road.rbegin()->output, &var_device);
if (ret != SUCCESS) {
GELOGE(ret, "Failed to re-assign memory on device, size %zu", trans_result.length);
return ret;
}

// sync new data to device
ret = CopyVarToDevice(var, trans_result, var_device);
if (ret != SUCCESS) {
GELOGE(ret, "Failed to send var data to device");
return ret;
}

return SUCCESS;
}

Status TransTensor(uint8_t *var_data, const NodePtr &var_src, const NodePtr &var_dst, formats::TransResult &result) {
GE_CHECK_NOTNULL(var_src);
GE_CHECK_NOTNULL(var_src->GetOpDesc());
GE_CHECK_NOTNULL(var_dst);
GE_CHECK_NOTNULL(var_dst->GetOpDesc());
auto src_data_shape_size = var_src->GetOpDesc()->GetOutputDesc(0).GetShape().GetShapeSize();
auto src_data_datatype = var_src->GetOpDesc()->GetOutputDesc(0).GetDataType();
auto dst_data_datatype = var_dst->GetOpDesc()->GetOutputDesc(0).GetDataType();
GE_IF_BOOL_EXEC(
src_data_datatype != dst_data_datatype,
auto ret = formats::TransDataType(
{var_data, static_cast<size_t>(src_data_shape_size), src_data_datatype, dst_data_datatype}, result);
if (ret != SUCCESS) {
GELOGE(INTERNAL_ERROR, "trans var data on host failed");
return ret;
});
return SUCCESS;
}

Status CopyTensorFromSrcVarNode(const NodePtr &var_src, const NodePtr &var_dst, uint64_t session_id,
uint32_t device_id) {
/// After the FE fusion pass, the input count of the ApplyMomentum op changed: the 0th input is
/// var_fp32 and the 6th input is var_fp16 (new).
/// Edges between var_fp32 and its fp16-consuming "dst_node" are unlinked, and an edge from
/// var_fp16 to dst_node is added, so the value must be copied from var_fp32 to var_fp16.
/// [op descs of var_src and var_dst are checked before being passed in; no nullptr check needed here]
GE_IF_BOOL_EXEC(var_src == nullptr || var_dst == nullptr, GELOGE(FAILED, "node var is nullptr"); return FAILED);
// src_node output_desc (fp32)
GeTensorDesc output_desc = var_src->GetOpDesc()->GetOutputDesc(0);
auto src_data_type = output_desc.GetDataType();
auto src_shape = output_desc.GetShape();
auto src_format = output_desc.GetFormat();
GELOGI("src_node %s, src_format %s, src_shape %s, src_type %s", var_src->GetName().c_str(),
TypeUtils::FormatToSerialString(src_format).c_str(), formats::ShapeToString(src_shape).c_str(),
TypeUtils::DataTypeToSerialString(src_data_type).c_str());
// dst_node output_desc (fp16)
GeTensorDesc dst_tensor_desc = var_dst->GetOpDesc()->GetOutputDesc(0);
auto data_type = dst_tensor_desc.GetDataType();
auto data_shape = dst_tensor_desc.GetShape();
auto data_format = dst_tensor_desc.GetFormat();
GELOGI("dst_node %s, src_format %s, src_shape %s, src_type %s", var_dst->GetName().c_str(),
TypeUtils::FormatToSerialString(data_format).c_str(), formats::ShapeToString(data_shape).c_str(),
TypeUtils::DataTypeToSerialString(data_type).c_str());
// Sync var data from device
std::unique_ptr<uint8_t[]> var_src_data;
RtContextSwitchGuard switch_context(RT_CTX_NORMAL_MODE, device_id);
// copy from src_node
auto ret = CopyVarFromDevice(session_id, var_src, var_src_data, output_desc);
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(FAILED, "Copy Var From Device failed"); return ret);
// trans dtype
formats::TransResult trans_result{};
ret = TransTensor(var_src_data.get(), var_src, var_dst, trans_result);
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(INTERNAL_ERROR, "trans var data on host failed"); return ret);
// reset src value.
void *var_device = nullptr;
ret = ReAssignVarAddr(session_id, var_dst->GetName(), dst_tensor_desc, &var_device);
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(INTERNAL_ERROR, "assign mem failed"); return ret);
// copy to device
ret = CopyVarToDevice(var_dst, trans_result, var_device);
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(ret, "Failed to send var data to device"); return ret);
return SUCCESS;
}
} // namespace
Status TransVarDataUtils::SyncVarData2BroadCast(const string &var_name, const ge::GeTensorDesc &src_tensor_desc,
uint8_t *dst_addr, int64_t dst_addr_size, uint64_t session_id) {
GE_CHK_BOOL_RET_STATUS(dst_addr != nullptr, FAILED, "dst addr is null. ");
@@ -88,4 +423,101 @@ Status TransVarDataUtils::SyncTensorToDevice(const string &var_name, const uint8

return SUCCESS;
}

Status TransVarDataUtils::TransAllVarData(const vector<NodePtr> &variable_nodes, uint64_t session_id,
rtContext_t context, uint32_t graph_id, uint32_t thread_num) {
ThreadPool executor(thread_num);
std::vector<std::future<Status>> vector_future;
for (auto &node : variable_nodes) {
if (node == nullptr) {
continue;
}

if (node->GetType() != VARIABLE) {
continue;
}

std::future<Status> f = executor.commit(
[](const ge::NodePtr &node, uint64_t session_id, rtContext_t ctx, uint32_t graph_id) -> Status {
rtError_t rt_ret = rtCtxSetCurrent(ctx);
if (rt_ret != RT_ERROR_NONE) {
GELOGE(RT_FAILED, "Failed to set context, error_code is: 0x%X.", rt_ret);
return RT_FAILED;
}
uint32_t allocated_graph_id = 0;
Status ret = VarManager::Instance(session_id)->GetAllocatedGraphId(node->GetName(), allocated_graph_id);
if (ret != SUCCESS) {
GELOGE(INTERNAL_ERROR, "var has not been allocated, node:%s, graph_id:%u.", node->GetName().c_str(),
graph_id);
return INTERNAL_ERROR;
}
uint32_t changed_graph_id = 0;
ret = VarManager::Instance(session_id)->GetChangedGraphId(node->GetName(), changed_graph_id);
bool call_trans_var =
(ret == SUCCESS && changed_graph_id == graph_id && changed_graph_id != allocated_graph_id);
if (call_trans_var) {
GELOGI("VarManager::GetChangedGraphId() success, node:%s, graph_id:%u.", node->GetName().c_str(), graph_id);
VarTransRoad *trans_road = VarManager::Instance(session_id)->GetTransRoad(node->GetName());
if (trans_road == nullptr) {
GELOGI("The variable %s does not have any trans road", node->GetName().c_str());
return SUCCESS;
}
ret = TransVarData(node, *trans_road, session_id);
if (ret != SUCCESS) {
GELOGE(INTERNAL_ERROR, "TransVarData failed, node:%s, graph_id:%u.", node->GetName().c_str(), graph_id);
return INTERNAL_ERROR;
}
VarManager::Instance(session_id)->RemoveChangedGraphId(node->GetName());
}
return SUCCESS;
},
node, session_id, context, graph_id);
if (!f.valid()) {
GELOGE(FAILED, "Future is invalid");
return FAILED;
}
vector_future.push_back(std::move(f));
}

Status ret_status;
for (size_t i = 0; i < vector_future.size(); ++i) {
ret_status = vector_future[i].get();
if (ret_status != SUCCESS) {
GELOGE(ret_status, "TransAllVarData:: trans %zu vardata failed", i);
return ret_status;
}
}

return SUCCESS;
}
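
A sketch of the expected call site for the new TransAllVarData: gather the graph's VARIABLE nodes, capture the current runtime context, and let the internal thread pool (default 16 workers per the header) fan the work out. Function name and structure are illustrative:

ge::Status TransAllVars(const ge::ComputeGraphPtr &graph, uint64_t session_id, uint32_t graph_id) {
  rtContext_t ctx = nullptr;
  if (rtCtxGetCurrent(&ctx) != RT_ERROR_NONE) {
    return ge::RT_FAILED;
  }
  std::vector<ge::NodePtr> variable_nodes;
  for (auto &node : graph->GetAllNodes()) {
    if (node != nullptr && node->GetType() == ge::VARIABLE) {
      variable_nodes.push_back(node);
    }
  }
  return ge::TransVarDataUtils::TransAllVarData(variable_nodes, session_id, ctx, graph_id);
}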

Status TransVarDataUtils::CopyVarData(const ComputeGraphPtr &compute_graph, uint64_t session_id, uint32_t device_id) {
GELOGI("CopyVarData start: session_id:%lu.", session_id);
if (compute_graph == nullptr) {
GELOGE(FAILED, "compute_graph is nullptr");
return FAILED;
}

string cp_from_node;
bool copy_value = false;
for (auto &node : compute_graph->GetAllNodes()) {
GE_IF_BOOL_EXEC(node->GetOpDesc() == nullptr || node->GetOpDesc()->GetType() != VARIABLE, continue);
GE_IF_BOOL_EXEC(ge::AttrUtils::GetStr(node->GetOpDesc(), "_copy_from_var_node", cp_from_node),
GELOGI("Get original type of cp_from_node"));
if (cp_from_node.length() != 0) {
(void)ge::AttrUtils::GetBool(node->GetOpDesc(), "_copy_value", copy_value); // no need to check value
if (!copy_value) {
auto src_node = compute_graph->FindNode(cp_from_node);
GE_CHECK_NOTNULL(src_node);
GELOGI("current_var_node__: [%s] copy_from_var_node__: [%s].", node->GetName().c_str(),
src_node->GetName().c_str());
auto ret = CopyTensorFromSrcVarNode(src_node, node, session_id, device_id);
GE_IF_BOOL_EXEC(ret != SUCCESS, GELOGE(FAILED, "copy tensor failed!"); return FAILED);
// only copy once
(void)ge::AttrUtils::SetBool(node->GetOpDesc(), "_copy_value", true); // no need to check value
}
}
}
return SUCCESS;
}
} // namespace ge
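
CopyVarData's contract is attribute-driven: a fusion pass stamps the destination variable with "_copy_from_var_node" (the source node's name), and the loop above copies once and then latches "_copy_value" to true. A sketch of the producer side, using only the attributes read above (function and parameter names hypothetical):

// Hypothetical fusion-pass side: request a one-time copy into the new variable.
void MarkForCopy(const ge::OpDescPtr &dst_var_desc, const std::string &src_var_name) {
  (void)ge::AttrUtils::SetStr(dst_var_desc, "_copy_from_var_node", src_var_name);
  (void)ge::AttrUtils::SetBool(dst_var_desc, "_copy_value", false);
}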

+ 8
- 0
src/ge/graph/manager/trans_var_data_utils.h View File

@@ -22,6 +22,9 @@
#include "framework/common/ge_inner_error_codes.h"
#include "framework/common/ge_types.h"
#include "graph/utils/tensor_utils.h"
#include "graph/node.h"
#include "runtime/context.h"
#include "graph_var_manager.h"

namespace ge {
class TransVarDataUtils {
@@ -31,6 +34,11 @@ class TransVarDataUtils {
static ge::Status SyncBroadCastData2Var(uint8_t *src_addr, int64_t src_addr_size, const string &var_name,
const ge::GeTensorDesc &dst_tensor_desc, uint64_t session_id_);

static ge::Status TransAllVarData(const std::vector<NodePtr> &variable_nodes, uint64_t session_id,
rtContext_t context, uint32_t graph_id, uint32_t thread_num = 16);

static ge::Status CopyVarData(const ComputeGraphPtr &compute_graph, uint64_t session_id, uint32_t device_id);

private:
static ge::Status SyncTensorToHost(const string &var_name, const ge::GeTensorDesc &src_tensor_desc,
uint8_t **host_addr, int64_t &addr_size, uint64_t session_id_);


+ 229
- 38
src/ge/graph/manager/util/hcom_util.cc View File

@@ -24,35 +24,49 @@
#include "graph/utils/type_utils.h"

namespace ge {
Status HcomOmeUtil::GetHcomDataType(const ge::ConstOpDescPtr &op_desc, hcclDataType_t &data_type) {

Status HcomOmeUtil::GetHcclDataType(const ge::ConstOpDescPtr &op_desc,
std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos) {
GE_CHECK_NOTNULL(op_desc);
if (CheckKernelHcclInfo(op_desc, kernel_hccl_infos) != SUCCESS) {
GELOGE(PARAM_INVALID, "HcomOmeUtil:: the number of GETaskKernelHcclInfo is invalid.");
return PARAM_INVALID;
}
GELOGI("GetHcclDataType start, node[%s], opType[%s].", op_desc->GetName().c_str(), op_desc->GetType().c_str());
if (op_desc->GetType() == HVDWAIT) {
return SUCCESS;
}
ge::DataType src_data_type = ge::DT_FLOAT;
if (op_desc->GetType() == HCOMRECEIVE) {
bool ret = ge::AttrUtils::GetDataType(op_desc, HCOM_ATTR_DATA_TYPE, src_data_type);
if (ret == false) {
GELOGE(PARAM_INVALID, "op:HcomReceive, op desc no attr: dtype.");
for (size_t i = 0; i < kernel_hccl_infos.size(); i++) {
if (op_desc->GetType() == HCOMRECEIVE) {
bool ret = ge::AttrUtils::GetDataType(op_desc, HCOM_ATTR_DATA_TYPE, src_data_type);
if (ret == false) {
GELOGE(PARAM_INVALID, "op:HcomReceive, op desc no attr: dtype.");
return PARAM_INVALID;
}
} else {
auto input_desc_ptr = op_desc->GetInputDescPtr(i);
GE_CHECK_NOTNULL(input_desc_ptr);
src_data_type = input_desc_ptr->GetDataType();
}

auto iter = kConstOpHcclDataType.find(static_cast<int64_t>(src_data_type));
if (iter == kConstOpHcclDataType.end()) {
GELOGE(PARAM_INVALID,
"HcomOmeUtil:: Node: %s Optype: %s HcomDataType cann't support! Current Davinci Data Type : %s",
op_desc->GetName().c_str(), op_desc->GetType().c_str(),
ge::TypeUtils::DataTypeToSerialString(src_data_type).c_str());
return PARAM_INVALID;
}
} else {
auto input_desc_ptr = op_desc->GetInputDescPtr(0);
GE_CHECK_NOTNULL(input_desc_ptr);
src_data_type = input_desc_ptr->GetDataType();
}

auto iter = kConstOpHcomDataType.find(static_cast<int64_t>(src_data_type));
if (iter == kConstOpHcomDataType.end()) {
GELOGE(PARAM_INVALID, "HcomOmeUtil:: HcomDataType cann't support! Current Davinci Data Type : %s",
ge::TypeUtils::DataTypeToSerialString(src_data_type).c_str());
return PARAM_INVALID;
kernel_hccl_infos[i].dataType = iter->second;
}

data_type = iter->second;
return SUCCESS;
}

Status HcomOmeUtil::GetHcomTypeSize(hcclDataType_t data_type, int32_t &size) {
auto iter = kConstOpHcomDataTypeSize.find(data_type);
GE_CHK_BOOL_EXEC(iter != kConstOpHcomDataTypeSize.end(), return PARAM_INVALID,
Status HcomOmeUtil::GetHcclTypeSize(hcclDataType_t data_type, int32_t &size) {
auto iter = kConstOpHcclDataTypeSize.find(data_type);
GE_CHK_BOOL_EXEC(iter != kConstOpHcclDataTypeSize.end(), return PARAM_INVALID,
"HcomOmeUtil::HcomDataTypeSize , No DataTypeSize!");

size = iter->second;
@@ -62,10 +76,14 @@ Status HcomOmeUtil::GetHcomTypeSize(hcclDataType_t data_type, int32_t &size) {
Status HcomOmeUtil::GetHcomCount(const ge::ConstOpDescPtr &op_desc, hcclDataType_t data_type, bool is_allgather,
int &count) {
GE_CHECK_NOTNULL(op_desc);
if (!IsHCOMOp(op_desc->GetType())) {
GELOGE(PARAM_INVALID, "HcomOmeUtil:: operator is not Hcom operator.");
return PARAM_INVALID;
}
int64_t total_size = 0;
int64_t align_size = 512;
int32_t size = 0;
GE_CHK_STATUS_RET(HcomOmeUtil::GetHcomTypeSize(data_type, size), "GetHcomCount: GetHcomTypeSize fail!");
GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclTypeSize(data_type, size), "GetHcomCount: GetHcclTypeSize fail!");
if (op_desc->GetType() == HCOMRECEIVE) {
vector<int64_t> shape_dims;
bool ret = ge::AttrUtils::GetListInt(op_desc, HCOM_ATTR_SHAPE, shape_dims);
@@ -114,34 +132,207 @@ Status HcomOmeUtil::GetHcomCount(const ge::ConstOpDescPtr &op_desc, hcclDataType
return SUCCESS;
}

Status HcomOmeUtil::GetHcomOperationType(const ge::ConstOpDescPtr &op_desc, hcclRedOp_t &op_type) {
Status HcomOmeUtil::GetHorovodCount(const ge::ConstOpDescPtr &op_desc,
std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos) {
GE_CHECK_NOTNULL(op_desc);
if (!IsHorovodOp(op_desc->GetType())) {
GELOGE(PARAM_INVALID, "HcomOmeUtil:: operator is not Horovod operator.");
return PARAM_INVALID;
}
int64_t align_size = 512;
int32_t size = 0;
for (size_t i = 0; i < op_desc->GetInputsSize(); i++) {
GE_CHK_STATUS_RET(HcomOmeUtil::GetHcclTypeSize(static_cast<tagHcclDataType>(kernel_hccl_infos[i].dataType), size),
"GetHorovodCount: GetHcclTypeSize fail!");
int64_t input_size = 0;
int64_t block_size = 0;
GE_CHECK_NOTNULL(op_desc->GetInputDescPtr(i));
GE_CHK_STATUS_RET(ge::TensorUtils::GetSize(*op_desc->GetInputDescPtr(i), input_size),
"get size from TensorDesc failed, op : %s, input index : %zu", op_desc->GetName().c_str(), i);

std::string hcom_op_type;
GE_CHK_BOOL_EXEC(ge::AttrUtils::GetStr(op_desc, HCOM_ATTR_REDUCE_TYPE, hcom_op_type), return PARAM_INVALID,
"HcomOmeUtil::Get HCOM_ATTR_REDUCE_TYPE fail, not support!");

if (hcom_op_type == "min") {
op_type = HCCL_REP_OP_MIN;
} else if (hcom_op_type == "max") {
op_type = HCCL_REP_OP_MAX;
} else if (hcom_op_type == "prod") {
op_type = HCCL_REP_OP_PROD;
} else if (hcom_op_type == "sum") {
op_type = HCCL_REP_OP_SUM;
} else {
GELOGE(PARAM_INVALID, "HcomOmeUtil::Get HCOM_ATTR_REDUCE_TYPE fail, [%s] not support!", hcom_op_type.c_str());
int64_t shape_size = op_desc->GetInputDescPtr(i)->GetShape().GetShapeSize();
GE_CHK_STATUS_RET(ge::CheckInt64Int32MulOverflow(shape_size, size),
"Product of shape size and size beyond INT64_MAX");
if (kernel_hccl_infos[0].hccl_type == HVDCALLBACKALLGATHER) {
block_size = shape_size * size;
} else {
block_size = (input_size + align_size - 1) / align_size * align_size;
}

GE_CHK_BOOL_RET_STATUS(size != 0, PARAM_INVALID, "Size is zero");
GE_CHK_BOOL_EXEC(block_size % size == 0, return PARAM_INVALID, "block_size:%ld is not divisible by size:%d.",
block_size, size);
kernel_hccl_infos[i].count = static_cast<int>(block_size / size);
}

return SUCCESS;
}

Status HcomOmeUtil::GetHcclCount(const ge::ConstOpDescPtr &op_desc,
std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos) {
GE_CHECK_NOTNULL(op_desc);
Status ret;
ret = CheckKernelHcclInfo(op_desc, kernel_hccl_infos);
if (ret != SUCCESS) {
GELOGE(PARAM_INVALID, "HcomOmeUtil:: the number of GETaskKernelHcclInfo is invalid.");
return PARAM_INVALID;
}
GELOGI("GetHcclCount start, node[%s], opType[%s].", op_desc->GetName().c_str(), op_desc->GetType().c_str());
if (IsHCOMOp(op_desc->GetType())) {
int32_t count = 0;
ret = GetHcomCount(op_desc, static_cast<tagHcclDataType>(kernel_hccl_infos[0].dataType),
kernel_hccl_infos[0].hccl_type == HCOMALLGATHER, count);
if (ret != SUCCESS) {
GELOGE(ret, "HcomOmeUtil:: Node: %s Optype: %s get the Hcom operator hccl count fail.",
op_desc->GetName().c_str(), op_desc->GetType().c_str());
return PARAM_INVALID;
}
kernel_hccl_infos[0].count = count;
}

if (IsHorovodOp(op_desc->GetType())) {
ret = GetHorovodCount(op_desc, kernel_hccl_infos);
if (ret != SUCCESS) {
GELOGE(PARAM_INVALID, "HcomOmeUtil:: Node: %s Optype: %s get the Horovod hccl operator count fail.",
op_desc->GetName().c_str(), op_desc->GetType().c_str());
return PARAM_INVALID;
}
}

return SUCCESS;
}

Status HcomOmeUtil::GetHcomRootId(const ge::ConstOpDescPtr &op_desc, int64_t &root_id) {
Status HcomOmeUtil::GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, hcclRedOp_t &op_type) {
GE_CHECK_NOTNULL(op_desc);

if (IsHCOMOp(op_desc->GetType())) {
std::string hcom_op_type;
GE_CHK_BOOL_EXEC(ge::AttrUtils::GetStr(op_desc, HCOM_ATTR_REDUCE_TYPE, hcom_op_type), return PARAM_INVALID,
"HcomOmeUtil:: Node: %s Optype: %s Get HCOM_ATTR_REDUCE_TYPE fail, not support!",
op_desc->GetName().c_str(), op_desc->GetType().c_str());

if (hcom_op_type == "min") {
op_type = HCCL_REP_OP_MIN;
} else if (hcom_op_type == "max") {
op_type = HCCL_REP_OP_MAX;
} else if (hcom_op_type == "prod") {
op_type = HCCL_REP_OP_PROD;
} else if (hcom_op_type == "sum") {
op_type = HCCL_REP_OP_SUM;
} else {
GELOGE(PARAM_INVALID, "HcomOmeUtil::Get HCOM_ATTR_REDUCE_TYPE fail, [%s] not support!", hcom_op_type.c_str());
return PARAM_INVALID;
}
}

if (IsHorovodOp(op_desc->GetType())) {
int64_t horovod_op_type;
GE_CHK_BOOL_EXEC(ge::AttrUtils::GetInt(op_desc, ATTR_HOROVOD_ATTR_REDUCE_TYPE, horovod_op_type),
return PARAM_INVALID,
"HcomOmeUtil:: Node: %s Optype: %s Get ATTR_HOROVOD_ATTR_REDUCE_TYPE fail, not support!",
op_desc->GetName().c_str(), op_desc->GetType().c_str());

auto iter = kHorovodRedOpToHcclRedOp.find(static_cast<horovodRedOp_t>(horovod_op_type));
if (iter == kHorovodRedOpToHcclRedOp.end()) {
GELOGE(PARAM_INVALID, "HcomOmeUtil:: Node: %s Optype: %s HcomOpType cann't support! Current HcomOpType : %ld",
op_desc->GetName().c_str(), op_desc->GetType().c_str(), horovod_op_type);
return PARAM_INVALID;
}
op_type = iter->second;
}

return SUCCESS;
}

Status HcomOmeUtil::GetHcclRootId(const ge::ConstOpDescPtr &op_desc, int64_t &root_id) {
GE_CHECK_NOTNULL(op_desc);
GE_CHK_BOOL_EXEC(ge::AttrUtils::GetInt(op_desc, HCOM_ATTR_ROOT_RANK, root_id), return PARAM_INVALID,
"HcomOmeUtil::Get HCOM_ATTR_ROOT_INDEX fail, not support!");
"HcomOmeUtil::Node %s Optype: %s Get HCOM_ATTR_ROOT_INDEX fail, not support!",
op_desc->GetName().c_str(), op_desc->GetType().c_str());

return SUCCESS;
}

Status HcomOmeUtil::GetAllRootId(const ge::ConstOpDescPtr &op_desc,
std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos) {
GE_CHECK_NOTNULL(op_desc);
if (op_desc->GetType() == HCOMBROADCAST || op_desc->GetType() == HVDCALLBACKBROADCAST) {
GELOGI("GetAllRootId Node[%s] opType[%s] get hccl rootId.", op_desc->GetName().c_str(), op_desc->GetType().c_str());
int64_t root_id = 0;
Status dmrt = GetHcclRootId(op_desc, root_id);
if (dmrt != SUCCESS) {
GELOGE(FAILED, "davinci_model: GetHcomRootId fail! domi error: %u", dmrt);
return FAILED;
}

for (size_t i = 0; i < kernel_hccl_infos.size(); i++) {
kernel_hccl_infos[i].rootId = root_id;
}
}
return SUCCESS;
}

bool HcomOmeUtil::IsHCOMOp(const string &op_type) {
return (op_type == HCOMALLREDUCE) || (op_type == HCOMALLGATHER) || (op_type == HCOMBROADCAST) ||
(op_type == HCOMSEND) || (op_type == HCOMRECEIVE) || (op_type == HCOMREDUCESCATTER);
}

bool HcomOmeUtil::IsHorovodOp(const string &op_type) {
return (op_type == HVDCALLBACKALLREDUCE) || (op_type == HVDCALLBACKALLGATHER) || (op_type == HVDCALLBACKBROADCAST) ||
(op_type == HVDWAIT);
}

Status HcomOmeUtil::CheckKernelHcclInfo(const ge::ConstOpDescPtr &op_desc,
std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos) {
GE_CHECK_NOTNULL(op_desc);
if (IsHCOMOp(op_desc->GetType()) && kernel_hccl_infos.size() != 1) {
GELOGE(PARAM_INVALID, "HcomOmeUtil:: in Hcom scenario, the number of GETaskKernelHcclInfo is invalid.");
return PARAM_INVALID;
}

if (IsHorovodOp(op_desc->GetType())) {
if (op_desc->GetType() == HVDWAIT) {
return SUCCESS;
}
if (kernel_hccl_infos.empty() || op_desc->GetInputsSize() != kernel_hccl_infos.size()) {
GELOGE(PARAM_INVALID, "HcomOmeUtil:: in Horovod scenario, the number of GETaskKernelHcclInfo is invalid.");
return PARAM_INVALID;
}
}
return SUCCESS;
}

void HcomOmeUtil::GetHcclType(const domi::TaskDef &task_def, std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos) {
auto hccl_def = task_def.kernel_hccl();
std::string hccl_type = hccl_def.hccl_type();
for (size_t i = 0; i < kernel_hccl_infos.size(); i++) {
kernel_hccl_infos[i].hccl_type = hccl_type;
}
}

Status HcomOmeUtil::GetHorovodInputs(const ge::ConstOpDescPtr &op_desc,
std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos) {
GE_CHECK_NOTNULL(op_desc);
if (!IsHorovodOp(op_desc->GetType())) {
return SUCCESS;
}

if (CheckKernelHcclInfo(op_desc, kernel_hccl_infos) != SUCCESS) {
GELOGE(PARAM_INVALID, "HcomOmeUtil:: Node: %s Optype: %s the number of GETaskKernelHcclInfo is invalid.",
op_desc->GetName().c_str(), op_desc->GetType().c_str());
return PARAM_INVALID;
}

if (op_desc->GetType() == HVDWAIT) {
return SUCCESS;
}

for (size_t i = 0; i < op_desc->GetInputsSize(); i++) {
ConstGeTensorDescPtr input_desc = op_desc->GetInputDescPtr(i);
GETaskKernelHcclInfo &kernel_hccl_info = kernel_hccl_infos.at(i);
kernel_hccl_info.input_name = op_desc->GetInputNameByIndex(i);
kernel_hccl_info.dims = input_desc->GetShape().GetDims();
}
return SUCCESS;
}
} // namespace ge
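
Putting the refactored helpers together: a task builder is expected to fill each GETaskKernelHcclInfo in stages, hccl_type first (the later helpers branch on it), then data type, count, and root id. A condensed sketch using only the functions declared in this diff (wrapper name and error mapping are illustrative):

ge::Status FillHcclKernelInfos(const domi::TaskDef &task_def, const ge::ConstOpDescPtr &op_desc,
                               std::vector<ge::GETaskKernelHcclInfo> &infos) {
  ge::HcomOmeUtil::GetHcclType(task_def, infos);  // stamp hccl_type on every entry
  if (ge::HcomOmeUtil::GetHcclDataType(op_desc, infos) != ge::SUCCESS) return ge::PARAM_INVALID;
  if (ge::HcomOmeUtil::GetHcclCount(op_desc, infos) != ge::SUCCESS) return ge::PARAM_INVALID;
  if (ge::HcomOmeUtil::GetAllRootId(op_desc, infos) != ge::SUCCESS) return ge::FAILED;
  return ge::SUCCESS;
}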

+ 95
- 21
src/ge/graph/manager/util/hcom_util.h View File

@@ -22,72 +22,146 @@
#include <vector>

#include "common/debug/log.h"
#include "common/opskernel/ge_task_info.h"
#include "common/string_util.h"
#include "common/types.h"
#include "common/util.h"
#include "graph/op_desc.h"
#include "hccl/hcom.h"
#include "proto/task.pb.h"

namespace ge {
using std::string;
using std::vector;

static std::map<int64_t, hcclDataType_t> kConstOpHcomDataType = {
{ge::DT_FLOAT, HCCL_DATA_TYPE_FLOAT},
{ge::DT_FLOAT16, HCCL_DATA_TYPE_HALF},
{ge::DT_INT8, HCCL_DATA_TYPE_INT8},
{ge::DT_INT32, HCCL_DATA_TYPE_INT},
static std::map<int64_t, hcclDataType_t> kConstOpHcclDataType = {
{ge::DT_FLOAT, HCCL_DATA_TYPE_FLOAT},
{ge::DT_FLOAT16, HCCL_DATA_TYPE_HALF},
{ge::DT_INT8, HCCL_DATA_TYPE_INT8},
{ge::DT_INT32, HCCL_DATA_TYPE_INT},
};

static std::map<hcclDataType_t, int32_t> kConstOpHcomDataTypeSize = {
{HCCL_DATA_TYPE_FLOAT, sizeof(float)},
{HCCL_DATA_TYPE_HALF, sizeof(float) / 2},
{HCCL_DATA_TYPE_INT8, sizeof(int8_t)},
{HCCL_DATA_TYPE_INT, sizeof(int32_t)},
static std::map<hcclDataType_t, int32_t> kConstOpHcclDataTypeSize = {
{HCCL_DATA_TYPE_FLOAT, sizeof(float)},
{HCCL_DATA_TYPE_HALF, sizeof(float) / 2},
{HCCL_DATA_TYPE_INT8, sizeof(int8_t)},
{HCCL_DATA_TYPE_INT, sizeof(int32_t)},
};

static std::map<horovodRedOp_t, hcclRedOp_t> kHorovodRedOpToHcclRedOp = {
{HOROVOD_REP_OP_SUM, HCCL_REP_OP_SUM}, {HOROVOD_REP_OP_MIN, HCCL_REP_OP_MIN},
{HOROVOD_REP_OP_MAX, HCCL_REP_OP_MAX}, {HOROVOD_REP_OP_PROD, HCCL_REP_OP_PROD},
{HOROVOD_REP_OP_RESERVED, HCCL_REP_OP_RESERVED},
};

class HcomOmeUtil {
public:
///
/// @ingroup domi_ome
/// @brief GetHcomDataType
/// @brief GetHcclDataType
/// @return SUCCESS
/// @return FAIL
///
static Status GetHcomDataType(const ge::ConstOpDescPtr &op_desc, hcclDataType_t &data_type);
static Status GetHcclDataType(const ge::ConstOpDescPtr &op_desc,
std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos);

///
/// @ingroup domi_ome
/// @brief GetHcomTypeSize
/// @brief GetHcclTypeSize
/// @return SUCCESS
/// @return FAIL
///
static Status GetHcomTypeSize(hcclDataType_t data_type, int32_t &size);
static Status GetHcclTypeSize(hcclDataType_t data_type, int32_t &size);

///
/// @ingroup domi_ome
/// @brief GetHcomCount
/// @brief GetHcclCount
/// @return SUCCESS
/// @return FAIL
///
static Status GetHcomCount(const ge::ConstOpDescPtr &op_desc, hcclDataType_t data_type, bool is_allgather,
int &count);
static Status GetHcclCount(const ge::ConstOpDescPtr &op_desc, std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos);

///
/// @ingroup domi_ome
/// @brief GetHcclOperationType
/// @return SUCCESS
/// @return FAIL
///
static Status GetHcclOperationType(const ge::ConstOpDescPtr &op_desc, hcclRedOp_t &op_type);

///
/// @ingroup domi_ome
/// @brief GetHcclRootId
/// @return SUCCESS
/// @return FAIL
///
static Status GetHcclRootId(const ge::ConstOpDescPtr &op_desc, int64_t &root_id);

///
/// @ingroup domi_ome
/// @brief GetAllRootId
/// @return SUCCESS
/// @return FAIL
///
static Status GetAllRootId(const ge::ConstOpDescPtr &op_desc, std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos);

///
/// @ingroup domi_ome
/// @brief check whether the op_type is an hcom operator
/// @return true
/// @return false
///
static bool IsHCOMOp(const string &op_type);

///
/// @ingroup domi_ome
/// @brief check whether the op_type is a horovod operator
/// @return true
/// @return false
///
static bool IsHorovodOp(const string &op_type);

///
/// @ingroup domi_ome
/// @brief GetHcclType
/// @return void
///
static void GetHcclType(const domi::TaskDef &task_def, std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos);

///
/// @ingroup domi_ome
/// @brief GetHcomOperationType
/// @brief CheckKernelHcclInfo
/// @return SUCCESS
/// @return FAIL
///
static Status CheckKernelHcclInfo(const ge::ConstOpDescPtr &op_desc,
std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos);
///
/// @ingroup domi_ome
/// @brief GetHorovodInputs
/// @return SUCCESS
/// @return FAIL
///
static Status GetHcomOperationType(const ge::ConstOpDescPtr &op_desc, hcclRedOp_t &op_type);
static Status GetHorovodInputs(const ge::ConstOpDescPtr &op_desc,
std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos);

private:
///
/// @ingroup domi_ome
/// @brief GetHcomCount
/// @return SUCCESS
/// @return FAIL
///
static Status GetHcomCount(const ge::ConstOpDescPtr &op_desc, hcclDataType_t data_type, bool is_allgather,
int &count);
///
/// @ingroup domi_ome
/// @brief GetHcomRootId
/// @brief GetHorovodCount
/// @return SUCCESS
/// @return FAIL
///
static Status GetHcomRootId(const ge::ConstOpDescPtr &op_desc, int64_t &root_id);
static Status GetHorovodCount(const ge::ConstOpDescPtr &op_desc,
std::vector<GETaskKernelHcclInfo> &kernel_hccl_infos);
};
} // namespace ge
#endif // GE_GRAPH_MANAGER_UTIL_HCOM_UTIL_H_

+ 34
- 3
src/ge/graph/optimize/graph_optimize.cc View File

@@ -134,7 +134,7 @@ Status GraphOptimize::OptimizeOriginalGraph(ComputeGraphPtr &compute_graph) {
return GE_CLI_GE_NOT_INITIALIZED;
}

std::map<string, GraphOptimizerPtr> graph_optimizer = instance_ptr->OpsKernelManagerObj().GetAllGraphOptimizerObjs();
auto graph_optimizer = instance_ptr->OpsKernelManagerObj().GetAllGraphOptimizerObjsByPriority();
GELOGI("optimize by opskernel in original graph optimize phase. num of graph_optimizer is %lu.",
graph_optimizer.size());
string exclude_core_Type = (core_type_ == kVectorCore) ? kAicoreEngine : kVectorEngine;
@@ -154,6 +154,37 @@ Status GraphOptimize::OptimizeOriginalGraph(ComputeGraphPtr &compute_graph) {
return ret;
}

Status GraphOptimize::OptimizeOriginalGraphJudgeInsert(ComputeGraphPtr &compute_graph) {
GELOGD("OptimizeOriginalGraphJudgeInsert in");
GE_CHECK_NOTNULL(compute_graph);
Status ret = SUCCESS;
std::shared_ptr<GELib> instance_ptr = ge::GELib::GetInstance();
if (instance_ptr == nullptr || !instance_ptr->InitFlag()) {
GELOGE(GE_CLI_GE_NOT_INITIALIZED, "OptimizeOriginalGraph failed.");
return GE_CLI_GE_NOT_INITIALIZED;
}

auto graph_optimizer = instance_ptr->OpsKernelManagerObj().GetAllGraphOptimizerObjsByPriority();
GELOGI("optimize by opskernel in original graph optimize phase. num of graph_optimizer is %lu.",
graph_optimizer.size());
string exclude_core_Type = (core_type_ == kVectorCore) ? kAicoreEngine : kVectorEngine;
if (graph_optimizer.size() != 0) {
for (auto iter = graph_optimizer.begin(); iter != graph_optimizer.end(); ++iter) {
if (iter->first == exclude_core_Type) {
GELOGI("[OptimizeOriginalGraphJudgeInsert]: engine type will exclude: %s", exclude_core_Type.c_str());
continue;
}
GELOGI("Begin to refine running format by engine %s", iter->first.c_str());
ret = (iter->second)->OptimizeOriginalGraphJudgeInsert(*compute_graph);
if (ret != SUCCESS) {
GELOGE(ret, "[OptimizeOriginalGraphJudgeInsert]: graph optimize failed, ret:%d", ret);
return ret;
}
}
}
return ret;
}

Status GraphOptimize::NewOptimizeOriginalGraph(ComputeGraphPtr &compute_graph) {
GELOGD("NewOptimizeOriginalGraph in");
if (compute_graph == nullptr) {
@@ -168,7 +199,7 @@ Status GraphOptimize::NewOptimizeOriginalGraph(ComputeGraphPtr &compute_graph) {
return GE_CLI_GE_NOT_INITIALIZED;
}

std::map<string, GraphOptimizerPtr> graph_optimizer = instance_ptr->OpsKernelManagerObj().GetAllGraphOptimizerObjs();
auto graph_optimizer = instance_ptr->OpsKernelManagerObj().GetAllGraphOptimizerObjsByPriority();
GELOGI("optimize by opskernel in original graph optimize phase. num of graph_optimizer is %lu.",
graph_optimizer.size());
string exclude_core_Type = (core_type_ == kVectorCore) ? kAicoreEngine : kVectorEngine;
@@ -207,7 +238,7 @@ Status GraphOptimize::OptimizeOriginalGraphForQuantize(ComputeGraphPtr &compute_
return GE_CLI_GE_NOT_INITIALIZED;
}

std::map<string, GraphOptimizerPtr> graph_optimizer = instance_ptr->OpsKernelManagerObj().GetAllGraphOptimizerObjs();
auto graph_optimizer = instance_ptr->OpsKernelManagerObj().GetAllGraphOptimizerObjsByPriority();
GELOGI("optimize by opskernel in original graph optimize quantize phase. num of graph_optimizer is %zu.",
graph_optimizer.size());
Status ret = SUCCESS;


+ 2
- 0
src/ge/graph/optimize/graph_optimize.h View File

@@ -47,6 +47,8 @@ class GraphOptimize {
// original graph optimize
Status OptimizeOriginalGraph(ComputeGraphPtr &compute_graph);

Status OptimizeOriginalGraphJudgeInsert(ComputeGraphPtr &compute_graph);

// new original graph optimize
Status NewOptimizeOriginalGraph(ComputeGraphPtr &compute_graph);



+ 135
- 113
src/ge/graph/partition/dynamic_shape_partition.cc View File

@@ -43,39 +43,44 @@
#define REQUIRE_SUCCESS(cond, ...) REQUIRE(((cond) == SUCCESS), __VA_ARGS__)
#define REQUIRE_GRAPH_SUCCESS(cond, ...) REQUIRE(((cond) == GRAPH_SUCCESS), __VA_ARGS__)

namespace {
const bool kDebugging = (std::getenv("DEBUG_DYNAMIC_PARTITION") != nullptr);
} // namespace
bool IsExperimental() {
const static bool kIsExperimental = (std::getenv("EXPERIMENTAL_DYNAMIC_PARTITION") != nullptr);
return kIsExperimental;
}

#define DLOG() \
if (kDebugging) std::cerr
namespace ge {
using Cluster = DynamicShapePartitioner::Cluster;
using ClusterPtr = std::shared_ptr<Cluster>;

Status DynamicShapePartitioner::Partition() {
REQUIRE_NOT_NULL(root_graph_, "Graph is nullptr.");
DLOG() << "Start dynamic shape partition graph " << root_graph_->GetName() << std::endl;
REQUIRE_SUCCESS(MarkUnknowShapeNodes(), "Failed mark unknow shape nodes.");
if (!IsExperimental()) {
GELOGD("Skip dynamic shape partition as not in experimental mode.");
REQUIRE(AttrUtils::SetBool(*root_graph_, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, false),
"Failed set dynamic shape partitioned flag on root graph.");
return SUCCESS;
}

GELOGD("Start dynamic shape partition graph %s.", root_graph_->GetName().c_str());
REQUIRE_SUCCESS(MarkUnknownShapeNodes(), "Failed mark unknown shape nodes.");
if (unknown_shape_nodes_.empty()) {
DLOG() << "Skip dynamic shape partition of graph " << root_graph_->GetName() << " as all nodes are known shape."
<< std::endl;
REQUIRE(AttrUtils::SetBool(*root_graph_, "_dynamic_shape_partitioned", false),
GELOGD("Skip dynamic shape partition of graph %s as all nodes are known shape.", root_graph_->GetName().c_str());
REQUIRE(AttrUtils::SetBool(*root_graph_, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, false),
"Failed set dynamic shape partitioned flag on root graph.");
return SUCCESS;
}
REQUIRE(AttrUtils::SetBool(*root_graph_, "_dynamic_shape_partitioned", true),
REQUIRE(AttrUtils::SetBool(*root_graph_, ATTR_NAME_DYNAMIC_SHAPE_PARTITIONED, true),
"Failed set dynamic shape partitioned flag on root graph.");

DumpGraph("_Before_DSP");
auto status = PartitionImpl();
DLOG() << DebugString() << std::endl;
GELOGD("%s.", DebugString().c_str());
if (status != SUCCESS) {
GELOGE(status, "Failed dynamic shape partition graph: %s, status:\n %s", root_graph_->GetName().c_str(),
DebugString().c_str());
}
DumpGraph("_After_DSP");
DLOG() << (status == SUCCESS ? "Succeed" : "Failed") << " dynamic shape partition graph " << root_graph_->GetName()
<< std::endl;
GELOGD("Finish dynamic shape partition graph %s.", root_graph_->GetName().c_str());
ClearResource();
return status;
}
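
Note the gate is latched: IsExperimental() reads EXPERIMENTAL_DYNAMIC_PARTITION once through a function-local static, so the variable must be set before the first Partition() call in the process. In a test harness this looks like (POSIX setenv assumed):

#include <cstdlib>

// Enable dynamic shape partitioning for this process; must run before the
// first DynamicShapePartitioner::Partition() call, or the cached flag stays false.
static void EnableDynamicPartition() { setenv("EXPERIMENTAL_DYNAMIC_PARTITION", "1", 1); }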
@@ -122,36 +127,36 @@ Status DynamicShapePartitioner::BuildPartitionSubgraph() {
return SUCCESS;
}

std::string DynamicShapePartitioner::DebugString() {
size_t unknow = 0;
size_t know = 0;
std::string DynamicShapePartitioner::DebugString() const {
size_t unknown = 0;
size_t known = 0;
size_t data = 0;
size_t netoutput = 0;
std::stringstream ss;
ss << "All unknow shape nodes:" << std::endl;
ss << "All unknown shape nodes:" << std::endl;
for (auto node : unknown_shape_nodes_) {
ss << " [" << node->GetName() << "](" << node->GetType() << ")" << std::endl;
}
for (auto cluster : unique_clusters_) {
if (cluster->IsUnknowShape()) {
unknow++;
} else if (cluster->IsKnowShape()) {
know++;
if (cluster->IsUnknownShape()) {
unknown++;
} else if (cluster->IsKnownShape()) {
known++;
} else if (cluster->IsData()) {
data++;
} else if (cluster->IsNetOutput()) {
netoutput++;
}
}
ss << "All clusters:" << unique_clusters_.size() << ", data:" << data << ", know:" << know << ", unknow:" << unknow
<< ", netoutput:" << netoutput << std::endl;
ss << "All clusters:" << unique_clusters_.size() << ", data:" << data << ", known:" << known
<< ", unknown:" << unknown << ", netoutput:" << netoutput << std::endl;
for (auto cluster : unique_clusters_) {
ss << " " << cluster->DebugString() << std::endl;
}
return ss.str();
}

void DynamicShapePartitioner::DumpGraph(std::string suffix) {
void DynamicShapePartitioner::DumpGraph(const std::string &suffix) {
GraphUtils::DumpGEGraphToOnnx(*root_graph_, root_graph_->GetName() + suffix);
for (auto sub_graph : root_graph_->GetAllSubgraphs()) {
GraphUtils::DumpGEGraphToOnnx(*sub_graph, sub_graph->GetName() + suffix);
@@ -169,10 +174,10 @@ void DynamicShapePartitioner::ClearResource() {
root_graph_.reset();
}

Status DynamicShapePartitioner::MarkUnknowShapeNodes() {
Status DynamicShapePartitioner::MarkUnknownShapeNodes() {
auto graph = root_graph_;
for (auto &node : graph->GetDirectNode()) {
REQUIRE_SUCCESS(CollectSpreadUnknowShapeNodes(node), "Failed collect spread unknow shape nodes %s.",
REQUIRE_SUCCESS(CollectSpreadUnknownShapeNodes(node), "Failed collect spread unknown shape nodes %s.",
node->GetName().c_str());
}
return SUCCESS;
@@ -188,14 +193,14 @@ Status DynamicShapePartitioner::InitClusters() {
} else if (node->GetType() == NETOUTPUT) {
type = Cluster::NETOUTPUT;
} else if (unknown_shape_nodes_.count(node) > 0) {
type = Cluster::UNKNOW_SHAPE;
type = Cluster::UNKNOWN_SHAPE;
} else {
type = Cluster::KNOW_SHAPE;
type = Cluster::KNOWN_SHAPE;
}
auto cluster = MakeShared<Cluster>(rank++, type, node, this);
REQUIRE_NOT_NULL(cluster, "Failed new memory for cluster.");
node_2_cluster_[node] = cluster;
if (cluster->IsUnknowShape()) {
if (cluster->IsUnknownShape()) {
ordered_cluster_.push_back(cluster);
}
// Already sorted topologically, so access to the parent cluster is safe
@@ -203,18 +208,15 @@ Status DynamicShapePartitioner::InitClusters() {
cluster->AddInput(node_2_cluster_[parent]);
}
}
if (kDebugging) {
for (const auto node : graph->GetDirectNode()) {
DLOG() << "Make cluster for node :" << node->GetName() << ":" << node_2_cluster_[node]->DebugString()
<< std::endl;
}
for (const auto node : graph->GetDirectNode()) {
GELOGD("Make cluster for node %s : %s.", node->GetName().c_str(), node_2_cluster_[node]->DebugString().c_str());
}
return SUCCESS;
}

Status DynamicShapePartitioner::TopologicalSortClusters() {
ordered_cluster_.clear();
// BFS topological sort clusters for know shape cluster
// BFS topological sort of clusters, collecting known-shape clusters in order
std::queue<ClusterPtr> ready_clusters;
std::unordered_map<ClusterPtr, size_t> cluster_pending_count;
std::unordered_set<ClusterPtr> seen_clusters;
@@ -231,16 +233,17 @@ Status DynamicShapePartitioner::TopologicalSortClusters() {
cluster_pending_count[cluster] = pending_count;
}
}

size_t rank = 0;
while (!ready_clusters.empty()) {
auto cluster = ready_clusters.front();
ready_clusters.pop();
cluster->UpdateRank(rank++);
if (cluster->IsKnowShape()) {
if (cluster->IsKnownShape()) {
ordered_cluster_.push_back(cluster);
}
for (auto out_cluster : cluster->Outputs()) {
if (--cluster_pending_count[out_cluster] == 0) {
if (cluster_pending_count[out_cluster] > 0 && --cluster_pending_count[out_cluster] == 0) {
ready_clusters.push(out_cluster);
}
}
@@ -252,49 +255,58 @@ Status DynamicShapePartitioner::TopologicalSortClusters() {
}

namespace {
template <typename T>
static std::string ToString(T vec) {
if (vec.empty()) {
static std::string ToString(const std::vector<ClusterPtr> &clusters) {
if (clusters.empty()) {
return "()";
}
std::stringstream ss;
ss << "(";
auto iter = vec.begin();
for (size_t i = 0; i < vec.size() - 1; i++) {
ss << (*iter++)->Id() << ",";
auto iter = clusters.begin();
for (size_t i = 0; i < clusters.size() - 1; i++) {
ss << (*iter)->Id() << ",";
iter++;
}
ss << (*iter++)->Id() << ").";
ss << (*iter)->Id() << ").";
return ss.str();
}
} // namespace

Status DynamicShapePartitioner::MergeClusters() {
// Merge unknow shape clusters
// Merge unknown shape clusters
for (auto cluster : ordered_cluster_) {
for (auto in_cluster : cluster->Inputs()) {
if (in_cluster->IsUnknowShape()) {
auto merged_clusters = cluster->MergeAllPathFrom(in_cluster);
DLOG() << "Merge all path cluster from " << in_cluster->Id() << " to " << cluster->Id()
<< ToString(merged_clusters) << std::endl;
for (auto merged_cluster : merged_clusters) {
for (auto node : merged_cluster->Nodes()) {
node_2_cluster_[node] = cluster;
}
if (!in_cluster->IsUnknownShape()) {
continue;
}
auto merged_clusters = cluster->MergeAllPathFrom(in_cluster);
GELOGD("Merge all path cluster from %lu to %lu %s.", in_cluster->Id(), cluster->Id(),
ToString(merged_clusters).c_str());
for (auto merged_cluster : merged_clusters) {
for (auto node : merged_cluster->Nodes()) {
node_2_cluster_[node] = cluster;
}
}
}
}
REQUIRE_SUCCESS(TopologicalSortClusters(), "Failed topological sort clusters after merge unknow shape clusters.");
// Merge know shape clusters

REQUIRE_SUCCESS(TopologicalSortClusters(), "Failed topological sort clusters after merge unknown shape clusters.");
// Merge known shape clusters
for (auto cluster : ordered_cluster_) {
if (cluster->IsRefVariable() && cluster->Inputs().size() == 1) {
auto in_cluster = *(cluster->Inputs().begin());
in_cluster->Merge(cluster);
node_2_cluster_[*(cluster->Nodes().begin())] = in_cluster;
continue;
}

for (auto in_cluster : cluster->Inputs()) {
if (in_cluster->IsKnowShape()) {
if (cluster->TryMerge(in_cluster)) {
DLOG() << "Success merge known shape cluster " << in_cluster->Id() << " to " << cluster->Id() << "."
<< std::endl;
for (auto node : in_cluster->Nodes()) {
node_2_cluster_[node] = cluster;
}
if (!in_cluster->IsKnownShape()) {
continue;
}
if (cluster->TryMerge(in_cluster)) {
GELOGD("Success merge known shape cluster from %lu to %lu.", in_cluster->Id(), cluster->Id());
for (auto node : in_cluster->Nodes()) {
node_2_cluster_[node] = cluster;
}
}
}
@@ -302,23 +314,30 @@ Status DynamicShapePartitioner::MergeClusters() {
return SUCCESS;
}

Status DynamicShapePartitioner::CollectSpreadUnknowShapeNodes(NodePtr node) {
Status DynamicShapePartitioner::CollectSpreadUnknownShapeNodes(NodePtr node) {
if (unknown_shape_nodes_.count(node) > 0) {
return SUCCESS;
}
auto opdesc = node->GetOpDesc();
// One can set 'ATTR_NAME_IS_UNKNOWN_SHAPE=true' on a node to force it into the unknown subgraph,
// ignoring its actual shape.
bool is_forced_unknown = false;
if (AttrUtils::GetBool(opdesc, ATTR_NAME_IS_UNKNOWN_SHAPE, is_forced_unknown) && is_forced_unknown) {
GELOGD("Collect node %s as unknown as it was marked unknown forcibly.", node->GetName().c_str());
unknown_shape_nodes_.insert(node);
return SUCCESS;
}
size_t anchor_index = 0;
bool is_unknow = false;
bool is_unknown = false;
for (auto &out_tensor : opdesc->GetAllOutputsDesc()) {
if (IsUnknowShapeTensor(out_tensor)) {
DLOG() << "Collect node " << node->GetName() << " as unknown as output " << anchor_index << " is unknown"
<< std::endl;
is_unknow = true;
if (IsUnknownShapeTensor(out_tensor)) {
GELOGD("Collect node %s as unknown as output %lu is unknown.", node->GetName().c_str(), anchor_index);
is_unknown = true;
auto anchor = node->GetOutDataAnchor(anchor_index);
for (const auto peer_anchor : anchor->GetPeerInDataAnchors()) {
if (peer_anchor != nullptr) {
DLOG() << "Collect node " << peer_anchor->GetOwnerNode()->GetName() << " as has unknown input from "
<< node->GetName() << ":" << anchor_index << std::endl;
GELOGD("Collect node %s as has unknown input from %s:%lu.", peer_anchor->GetOwnerNode()->GetName().c_str(),
node->GetName().c_str(), anchor_index);
unknown_shape_nodes_.insert(peer_anchor->GetOwnerNode());
}
}
@@ -327,21 +346,20 @@ Status DynamicShapePartitioner::CollectSpreadUnknowShapeNodes(NodePtr node) {
}
anchor_index = 0;
for (auto &in_tensor : opdesc->GetAllInputsDesc()) {
if (IsUnknowShapeTensor(in_tensor)) {
DLOG() << "Collect node " << node->GetName() << " as unknown as input " << anchor_index << " is unknown"
<< std::endl;
is_unknow = true;
if (IsUnknownShapeTensor(in_tensor)) {
GELOGD("Collect node %s as unknown as input %lu is unknown.", node->GetName().c_str(), anchor_index);
is_unknown = true;
auto anchor = node->GetInDataAnchor(anchor_index);
const auto peer_anchor = anchor->GetPeerOutAnchor();
if (peer_anchor != nullptr) {
DLOG() << "Collect node " << peer_anchor->GetOwnerNode()->GetName() << " as has unknown output to "
<< node->GetName() << ":" << anchor_index << std::endl;
GELOGD("Collect node %s as has unknown output to %s:%lu.", peer_anchor->GetOwnerNode()->GetName().c_str(),
node->GetName().c_str(), anchor_index);
unknown_shape_nodes_.insert(peer_anchor->GetOwnerNode());
}
}
anchor_index++;
}
if (is_unknow) {
if (is_unknown) {
unknown_shape_nodes_.insert(node);
} else {
auto graph = root_graph_;
@@ -350,11 +368,10 @@ Status DynamicShapePartitioner::CollectSpreadUnknowShapeNodes(NodePtr node) {
REQUIRE_NOT_NULL(subgraph, "Failed get subgraph %s of node %s on root graph.", subgraph_name.c_str(),
node->GetName().c_str());
bool is_graph_unknow = false;
REQUIRE_SUCCESS(IsUnknowShapeGraph(subgraph, is_graph_unknow), "Failed check subgraph %s shape of node %s.",
REQUIRE_SUCCESS(IsUnknownShapeGraph(subgraph, is_graph_unknow), "Failed check subgraph %s shape of node %s.",
subgraph_name.c_str(), node->GetName().c_str());
if (is_graph_unknow) {
DLOG() << "Collect node " << node->GetName() << " as its subgraph " << subgraph->GetName() << " is unknown."
<< std::endl;
GELOGD("Collect node %s as its subgraph %s is unknown.", node->GetName().c_str(), subgraph->GetName().c_str());
unknown_shape_nodes_.insert(node);
break;
}
@@ -363,20 +380,20 @@ Status DynamicShapePartitioner::CollectSpreadUnknowShapeNodes(NodePtr node) {
return SUCCESS;
}

Status DynamicShapePartitioner::IsUnknowShapeNode(NodePtr node, bool &is_unknow) {
Status DynamicShapePartitioner::IsUnknownShapeNode(NodePtr node, bool &is_unknown) {
auto opdesc = node->GetOpDesc();
auto graph = root_graph_;
for (auto &out_tensor : opdesc->GetAllOutputsDesc()) {
if (IsUnknowShapeTensor(out_tensor)) {
DLOG() << "Mark node " << node->GetName() << " unknown because unknown output " << std::endl;
is_unknow = true;
if (IsUnknownShapeTensor(out_tensor)) {
GELOGD("Mark node %s unknown as unknown output.", node->GetName().c_str());
is_unknown = true;
return SUCCESS;
}
}
for (auto &in_tensor : opdesc->GetAllInputsDesc()) {
if (IsUnknowShapeTensor(in_tensor)) {
DLOG() << "Mark node " << node->GetName() << " unknown because unknown intput " << std::endl;
is_unknow = true;
if (IsUnknownShapeTensor(in_tensor)) {
GELOGD("Mark node %s unknown as unknown intput.", node->GetName().c_str());
is_unknown = true;
return SUCCESS;
}
}
@@ -384,30 +401,30 @@ Status DynamicShapePartitioner::IsUnknowShapeNode(NodePtr node, bool &is_unknow)
auto subgraph = graph->GetSubgraph(subgraph_name);
REQUIRE_NOT_NULL(subgraph, "Failed get subgraph %s of node %s on root graph.", subgraph_name.c_str(),
node->GetName().c_str());
REQUIRE_SUCCESS(IsUnknowShapeGraph(subgraph, is_unknow), "Failed check subgraph %s shape of node %s.",
REQUIRE_SUCCESS(IsUnknownShapeGraph(subgraph, is_unknown), "Failed check subgraph %s shape of node %s.",
subgraph_name.c_str(), node->GetName().c_str());
if (is_unknow) {
DLOG() << "Mark node " << node->GetName() << " unknown because unknown subgraph " << std::endl;
if (is_unknown) {
GELOGD("Mark node %s unknown as unknown subgraph.", node->GetName().c_str());
return SUCCESS;
}
}
is_unknow = false;
is_unknown = false;
return SUCCESS;
}

Status DynamicShapePartitioner::IsUnknowShapeGraph(ComputeGraphPtr graph, bool &is_unknow) {
Status DynamicShapePartitioner::IsUnknownShapeGraph(ComputeGraphPtr graph, bool &is_unknown) {
for (auto &node : graph->GetDirectNode()) {
REQUIRE_SUCCESS(IsUnknowShapeNode(node, is_unknow), "Failed check node %s shape on graph %s.",
REQUIRE_SUCCESS(IsUnknownShapeNode(node, is_unknown), "Failed check node %s shape on graph %s.",
node->GetName().c_str(), graph->GetName().c_str());
if (is_unknow) {
DLOG() << "Mark graph " << graph->GetName() << " unknown because unknown node " << node->GetName() << std::endl;
if (is_unknown) {
GELOGD("Mark graph %s unknown as contains unknown node %s.", graph->GetName().c_str(), node->GetName().c_str());
return SUCCESS;
}
}
return SUCCESS;
}

bool DynamicShapePartitioner::IsUnknowShapeTensor(GeTensorDesc &tensor) {
bool DynamicShapePartitioner::IsUnknownShapeTensor(const GeTensorDesc &tensor) {
const static int kUnknowShape = -1;
const static int kUnknowRank = -2;
for (auto dim_size : tensor.GetShape().GetDims()) {
@@ -418,7 +435,7 @@ bool DynamicShapePartitioner::IsUnknowShapeTensor(GeTensorDesc &tensor) {
return false;
}

std::string Cluster::DebugString() {
std::string Cluster::DebugString() const {
std::stringstream ss;
switch (type_) {
case DATA:
@@ -427,10 +444,10 @@ std::string Cluster::DebugString() {
case NETOUTPUT:
ss << "NETOUTPUT";
break;
case UNKNOW_SHAPE:
case UNKNOWN_SHAPE:
ss << "UNKNOW";
break;
case KNOW_SHAPE:
case KNOWN_SHAPE:
ss << "KNOW";
break;
}
@@ -450,18 +467,22 @@ std::string Cluster::DebugString() {
return ss.str();
}

size_t Cluster::Id() { return id_; }
size_t Cluster::Id() const { return id_; }
void Cluster::UpdateRank(size_t rank) {
max_ = rank;
min_ = rank;
};
bool Cluster::IsData() { return type_ == DATA; };
bool Cluster::IsKnowShape() { return type_ == KNOW_SHAPE; };
bool Cluster::IsUnknowShape() { return type_ == UNKNOW_SHAPE; };
bool Cluster::IsNetOutput() { return type_ == NETOUTPUT; };
bool Cluster::IsolatedConstant() {
return ((nodes_.size() == 1) && (nodes_[0]->GetType() == CONSTANTOP) && (out_clusters_.size() == 1) &&
(*out_clusters_.begin())->IsUnknowShape() && in_clusters_.empty());
bool Cluster::IsData() const { return type_ == DATA; };
bool Cluster::IsKnownShape() const { return type_ == KNOWN_SHAPE; };
bool Cluster::IsUnknownShape() const { return type_ == UNKNOWN_SHAPE; };
bool Cluster::IsNetOutput() const { return type_ == NETOUTPUT; };
bool Cluster::IsRefVariable() const {
if ((nodes_.size() == 1) && ((nodes_[0]->GetType() == VARIABLE) || (nodes_[0]->GetType() == VARIABLEV2))) {
std::string ref_variable_name;
return (AttrUtils::GetStr(nodes_[0]->GetOpDesc(), REF_VAR_SRC_VAR_NAME, ref_variable_name) &&
!ref_variable_name.empty());
}
return false;
}
void Cluster::AddInput(ClusterPtr in) {
in_clusters_.insert(in);
@@ -562,9 +583,9 @@ std::vector<ClusterPtr> Cluster::MergeAllPathFrom(ClusterPtr other) {
}
return path_clusters;
}
std::unordered_set<ClusterPtr> Cluster::Inputs() { return in_clusters_; };
std::unordered_set<ClusterPtr> Cluster::Outputs() { return out_clusters_; };
std::vector<NodePtr> Cluster::Nodes() { return nodes_; };
std::unordered_set<ClusterPtr> Cluster::Inputs() const { return in_clusters_; };
std::unordered_set<ClusterPtr> Cluster::Outputs() const { return out_clusters_; };
std::vector<NodePtr> Cluster::Nodes() const { return nodes_; };

void Cluster::AddFrameInput(InDataAnchorPtr anchor) {
inputs_index_[anchor] = inputs_.size();
@@ -589,7 +610,7 @@ InControlAnchorPtr Cluster::GetFrameInControlAnchor() { return partition_node_->
OutControlAnchorPtr Cluster::GetFrameOutControlAnchor() { return partition_node_->GetOutControlAnchor(); };

Status Cluster::BuildFrame() {
if (IsUnknowShape() || IsKnowShape()) {
if (IsUnknownShape() || IsKnownShape()) {
return BuildPartitionFrame();
} else {
auto node = nodes_.front();
@@ -621,7 +642,7 @@ Status Cluster::BuildFrame() {

Status Cluster::BuildPartitionFrame() {
auto graph = partitioner_->root_graph_;
bool is_unknown_shape = IsUnknowShape();
bool is_unknown_shape = IsUnknownShape();
std::string sub_graph_name =
graph->GetName() + "_sub_" + std::to_string(unique_id_) + (is_unknown_shape ? "_unknow" : "_know");
subgraph_ = MakeShared<ComputeGraph>(sub_graph_name);
@@ -727,6 +748,7 @@ Status Cluster::BuildPartitionSubgraph() {
auto data_op = MakeShared<OpDesc>(std::string("Data_") + std::to_string(parent_node_index), ge::DATA);
REQUIRE_NOT_NULL(data_op, "Failed new memory for data op.");
auto input_desc = anchor->GetOwnerNode()->GetOpDesc()->GetInputDesc(anchor->GetIdx());
REQUIRE_GRAPH_SUCCESS(data_op->AddInputDesc(input_desc), "Failed add input desc.");
REQUIRE_GRAPH_SUCCESS(data_op->AddOutputDesc(input_desc), "Failed add output desc.");
REQUIRE(AttrUtils::SetInt(data_op, ATTR_NAME_PARENT_NODE_INDEX, parent_node_index),
"Failed set parent_node_index on subgraph data node.");

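The rename above also pins down the shape rule itself: a tensor counts as unknown-shape when any dimension is -1 (a dynamic dimension) or -2 (dynamic rank), matching the kUnknowShape/kUnknowRank constants in the first hunk. Below is a minimal standalone sketch of that check; the name IsUnknownShapeDims and the plain std::vector<int64_t> signature are illustrative stand-ins, not GE API, since the real helper takes a ge::GeTensorDesc.

#include <cstdint>
#include <vector>

// Sketch only: mirrors the -1/-2 sentinel convention used in the diff above.
bool IsUnknownShapeDims(const std::vector<int64_t> &dims) {
  constexpr int64_t kUnknownDim = -1;   // a single dynamic dimension
  constexpr int64_t kUnknownRank = -2;  // the entire rank is dynamic
  for (int64_t dim_size : dims) {
    if (dim_size == kUnknownDim || dim_size == kUnknownRank) {
      return true;
    }
  }
  return false;
}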

+25 -25 src/ge/graph/partition/dynamic_shape_partition.h

@@ -29,27 +29,27 @@ class DynamicShapePartitioner {
public:
// A cluster is a set of nodes that can be merged into the same partition;
// the correspondence between cluster types and nodes is:
-// DATA:DATA, UNKNOW_SHAPE:unknowshape, KNOW_SHAPE:knowshape, NETOUTPUT:NETOUTPUT.
+// DATA:DATA, UNKNOWN_SHAPE:unknown-shape node, KNOWN_SHAPE:known-shape node, NETOUTPUT:NETOUTPUT.
class Cluster : public std::enable_shared_from_this<Cluster> {
public:
-enum Type { DATA, NETOUTPUT, KNOW_SHAPE, UNKNOW_SHAPE };
-explicit Cluster(size_t rank, Type type, NodePtr node, DynamicShapePartitioner *partitioner)
+enum Type { DATA, NETOUTPUT, KNOWN_SHAPE, UNKNOWN_SHAPE };
+Cluster(size_t rank, Type type, NodePtr node, DynamicShapePartitioner *partitioner)
: id_(rank), min_(rank), max_(rank), type_(type), partitioner_(partitioner) {
nodes_.push_back(node);
}
~Cluster() = default;
-std::string DebugString();
+std::string DebugString() const;
// Basic bean functions
-size_t Id();
+size_t Id() const;
void UpdateRank(size_t rank);
-bool IsData();
-bool IsKnowShape();
-bool IsUnknowShape();
-bool IsNetOutput();
-std::unordered_set<std::shared_ptr<Cluster>> Inputs();
-std::unordered_set<std::shared_ptr<Cluster>> Outputs();
-std::vector<NodePtr> Nodes();
-bool IsolatedConstant();
+bool IsData() const;
+bool IsKnownShape() const;
+bool IsUnknownShape() const;
+bool IsNetOutput() const;
+std::unordered_set<std::shared_ptr<Cluster>> Inputs() const;
+std::unordered_set<std::shared_ptr<Cluster>> Outputs() const;
+std::vector<NodePtr> Nodes() const;
+bool IsRefVariable() const;
// Cluster modify functions
void AddInput(std::shared_ptr<Cluster> in);
void RemoveInput(std::shared_ptr<Cluster> in);
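For a concrete picture of the DATA/NETOUTPUT/KNOWN_SHAPE/UNKNOWN_SHAPE correspondence documented in the hunk above, here is a self-contained sketch of how a node maps onto the renamed enum. ClassifyNode, its parameters, and the bare op-type strings are hypothetical; the real InitClusters() works on ge::NodePtr together with the IsUnknownShapeNode() helper.

#include <string>

enum Type { DATA, NETOUTPUT, KNOWN_SHAPE, UNKNOWN_SHAPE };

// Hypothetical classifier: maps an op type plus the unknown-shape test result
// onto the cluster-type correspondence from the comment above.
Type ClassifyNode(const std::string &op_type, bool is_unknown_shape) {
  if (op_type == "Data") {
    return DATA;
  }
  if (op_type == "NetOutput") {
    return NETOUTPUT;
  }
  return is_unknown_shape ? UNKNOWN_SHAPE : KNOWN_SHAPE;
}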
@@ -110,16 +110,16 @@ class DynamicShapePartitioner {
// Collect nodes that satisfy the unknown-shape rules:
// 1) The tensor shape of any input or output is unknown shape (dim_size = -1) or unknown rank (dim_size = -2)
// 2) Any subgraph of the node has an operator that satisfies rule 1)
-Status MarkUnknowShapeNodes();
+Status MarkUnknownShapeNodes();
// Build a Cluster structure for each node, connected according to the connection relationships of the nodes.
// A cluster is a set of nodes that can be merged into the same partition;
// the correspondence between cluster types and nodes is:
-// DATA:DATA, UNKNOW_SHAPE:unknowshape, KNOW_SHAPE:knowshape, NETOUTPUT:NETOUTPUT
+// DATA:DATA, UNKNOWN_SHAPE:unknown-shape node, KNOWN_SHAPE:known-shape node, NETOUTPUT:NETOUTPUT
Status InitClusters();
// Merge clusters according to the following rules:
-// 1) Iterate through the UNKNOW_SHAPE clusters, if the input is UNKNOW_SHAPE,
+// 1) Iterate through the UNKNOWN_SHAPE clusters; if an input is UNKNOWN_SHAPE,
//    merge all the clusters in the path(s) between the two clusters
-// 2) Iterate through the KNOW_SHAPE clusters, if the input is KNOW_SHAPE, and
+// 2) Iterate through the KNOWN_SHAPE clusters; if an input is KNOWN_SHAPE and
//    there is only one path between the two clusters, merge the two clusters
Status MergeClusters();
// Topologically sort the clusters after merging unknown-shape clusters.
@@ -135,18 +135,18 @@ class DynamicShapePartitioner {
// Clear resource and break circular dependency
void ClearResource();
// Debug functions
-void DumpGraph(std::string suffix);
-std::string DebugString();
+void DumpGraph(const std::string &suffix);
+std::string DebugString() const;
// Util functions
-Status CollectSpreadUnknowShapeNodes(NodePtr node);
-Status IsUnknowShapeGraph(ge::ComputeGraphPtr graph, bool &is_unknow);
-Status IsUnknowShapeNode(ge::NodePtr node, bool &is_unknow);
-bool IsUnknowShapeTensor(ge::GeTensorDesc &tensor);
+Status CollectSpreadUnknownShapeNodes(NodePtr node);
+Status IsUnknownShapeGraph(ge::ComputeGraphPtr graph, bool &is_unknow);
+Status IsUnknownShapeNode(ge::NodePtr node, bool &is_unknow);
+bool IsUnknownShapeTensor(const ge::GeTensorDesc &tensor);
ge::ComputeGraphPtr root_graph_; // The original graph to partition
std::unordered_map<NodePtr, std::shared_ptr<Cluster>> node_2_cluster_;  // Map each node to the cluster it belongs to
// Topologically sorted clusters; this field changes as splitting proceeds.
-// When partitioning UNKNOW_SHAPE cluster, it is a collection of all topological sorted UNKNOW_SHAPE clusters
-// When partitioning KNOW_SHAPE cluster, it is a collection of all topological sorted KNOW_SHAPE clusters
+// When partitioning UNKNOWN_SHAPE clusters, it is the collection of all topologically sorted UNKNOWN_SHAPE clusters
+// When partitioning KNOWN_SHAPE clusters, it is the collection of all topologically sorted KNOWN_SHAPE clusters
std::vector<std::shared_ptr<Cluster>> ordered_cluster_;
// Unique clusters remaining after merging
std::unordered_set<std::shared_ptr<Cluster>> unique_clusters_;

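Finally, the two MergeClusters() rules quoted in the header comments lend themselves to a compact restatement. The sketch below uses a stripped-down MiniCluster stand-in plus a placeholder SinglePath() test; none of these names are GE API, and the real implementation additionally merges every cluster along the paths between the pair and re-sorts the clusters topologically afterwards.

#include <unordered_set>
#include <utility>
#include <vector>

enum class Kind { kData, kNetOutput, kKnownShape, kUnknownShape };

// Stripped-down stand-in for Cluster: just a type tag and input edges.
struct MiniCluster {
  Kind kind;
  std::unordered_set<MiniCluster *> inputs;
};

// Placeholder for rule 2's "only one path between the two clusters" check.
bool SinglePath(const MiniCluster *src, const MiniCluster *dst) {
  (void)src;
  (void)dst;
  return true;  // assume a single path, for illustration only
}

// Returns the {dst, src} pairs that the two merge rules would combine.
std::vector<std::pair<MiniCluster *, MiniCluster *>> PlanMerges(
    const std::vector<MiniCluster *> &ordered_clusters) {
  std::vector<std::pair<MiniCluster *, MiniCluster *>> merges;
  for (MiniCluster *c : ordered_clusters) {
    for (MiniCluster *in : c->inputs) {
      // Rule 1: an unknown-shape cluster absorbs an unknown-shape input.
      bool rule1 = c->kind == Kind::kUnknownShape && in->kind == Kind::kUnknownShape;
      // Rule 2: a known-shape cluster absorbs a known-shape input on a single path.
      bool rule2 = c->kind == Kind::kKnownShape && in->kind == Kind::kKnownShape &&
                   SinglePath(in, c);
      if (rule1 || rule2) {
        merges.emplace_back(c, in);
      }
    }
  }
  return merges;
}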
